knitr::opts_chunk$set(echo = T, warning = F, message = F, cache = T, results = 'hide')

Benchmarking CSIDE on a simulated spatial transcriptomics dataset

library(spacexr)
library(Matrix)
library(devtools)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(dplyr)
source('~/Documents/MIT/Research/Rafalab/Projects/spacexr/AnalysisCSIDE/helper_functions/de_simulation_helper.R')
devtools::load_all()
load_all()
set_global_Q_all()

ref <- readRDS("../../data/Reference/10xCer/reference.rds")
resultsdir <- '../../data/Reference/10xCer/de_simulation'
common_cell_types <- c('Bergmann', 'Purkinje')
n_cell_types = length(common_cell_types)
trials = 77 
n_conditions = 13
boundary = ceiling(n_conditions / 2) 
N_samples = (n_cell_types * trials * n_conditions * (n_cell_types - 1))/2
first_UMI = numeric(N_samples); first_type = character(N_samples); second_type = character(N_samples)
UMI_tot = 1000; UMI_step = UMI_tot / (n_conditions-1)
UMI_tot = round(UMI_step * (n_conditions-1)); UMI1_vec = round(0:(n_conditions-1)*UMI_step)
nUMI = ref@nUMI
UMI1_vec = round(0:(n_conditions-1)*UMI_step)
UMI1 <- rep(UMI1_vec,each = trials)
UMI_vect <- rep(UMI_tot, N_samples)
cell_type_names <- levels(ref@cell_types)

Run RCTD on simulated data (the first step of CSIDE)

puck.original <- SpatialRNA(NULL, ref@counts, nUMI = ref@nUMI, T)
print('create.RCTD: getting regression differentially expressed genes: ')
cell_type_info <- list(info = process_cell_type_info(ref, cell_type_names, CELL_MIN = 25), renorm = NULL)
gene_list= get_de_genes(cell_type_info$info, puck.original)
puck <- generate_sim_puck(common_cell_types, gene_list, ref)
reference <- restrict_reference_cell_types(ref, common_cell_types) # only relevant cell types
myRCTD <- create.RCTD(puck, reference, max_cores = 1)
myRCTD <- run.RCTD(myRCTD)
saveRDS(myRCTD, file.path(resultsdir, 'myRCTD.rds'))

Load in RCTD results

myRCTD <- readRDS(file.path(resultsdir, 'myRCTD.rds'))
puck <- myRCTD@originalSpatialRNA
sigma_init <- as.character(100*myRCTD@internal_vars$sigma)
results <- myRCTD@results
norm_weights = sweep(results$weights, 1, rowSums(results$weights), '/')
beta <- as.matrix(norm_weights)
cell_type_info <- myRCTD@cell_type_info$info

Run SPARC on contamination simulation

de_gt_vals <- exp((-3:3)/5)
gt_val <- 1
COND <- length(de_gt_vals) # number of conditions for DE
REPLICATES <- 100 # 100
alpha_res <- array(0, dim = c(REPLICATES, 2, COND))
thresh <- 5 #flip 50% of them
region_base <- c(rep(0, floor(N_samples/2)), rep(1, ceiling(N_samples/2)))
region <- region_base
N_int <- 10
region[((1:N_samples) %% N_int) < thresh] <- 1 - region[((1:N_samples) %% 10) < thresh]
de_gene <- 'Nrxn3'
for(condi in 1:COND) {
  de_ground_truth <- c(gt_val, de_gt_vals[condi])
  de_res <- sim_SPARK(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                      UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta,UMI_vect,
                      other_methods = T, region_orig = region)
  alpha_res[,, condi] <- de_res
}
saveRDS(alpha_res, 'spark-contam.rds')

Plot contamination simulation results

alpha_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/alpha_res.rds'))
dec_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/dec_res.rds'))
bulk_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/bulk_res.rds'))
sing_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/sing_res.rds'))
spark_res <- readRDS(file.path(resultsdir, 'spark/spark-contam.rds'))
REPLICATES <- 100
de_gt_vals <- exp((-3:3)/5)
gt_val <- 1
plot_df <- as.data.frame(cbind( log(gt_val), log(de_gt_vals), apply(alpha_res,3:2,mean), apply(spark_res, 3:2, mean), apply(dec_res,3:2,mean), apply(bulk_res,2,mean), apply(sing_res,3:2,mean), apply(alpha_res,3:2,sd),apply(spark_res, 3:2, sd), apply(dec_res,3:2,sd), apply(bulk_res,2,sd), apply(sing_res,3:2,sd)))
colnames(plot_df) <- c('log_fc_b', 'log_fc_p', 'CSIDE_b', 'CSIDE_p', 'SPARK_b', 'SPARK_p','dec_b', 'dec_p', 'bulk','sing_b','sing_p', 'sd_CSIDE_b','sd_CSIDE_p','sd_SPARK_b','sd_SPARK_p', 'sd_dec_b','sd_dec_p','sd_bulk', 'sd_sing_b','sd_sing_p')
graph_df <- plot_df %>% 
  select(log_fc_p, CSIDE_b, sd_CSIDE_b) %>% 
  rename(estimate=CSIDE_b, sd=sd_CSIDE_b) %>%
  mutate(type = 'CSIDE_b') %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, CSIDE_p, sd_CSIDE_p) %>% 
              rename(estimate=CSIDE_p, sd=sd_CSIDE_p) %>%
              mutate(type = 'CSIDE_p')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, SPARK_p, sd_SPARK_p) %>% 
              rename(estimate=SPARK_p, sd=sd_SPARK_p) %>%
              mutate(type = 'SPARK_p')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, SPARK_b, sd_SPARK_b) %>% 
              rename(estimate=SPARK_b, sd=sd_SPARK_b) %>%
              mutate(type = 'SPARK_b')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, dec_b, sd_dec_b) %>% 
              rename(estimate=dec_b, sd=sd_dec_b) %>%
              mutate(type = 'dec_b')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, dec_p, sd_dec_p) %>% 
              rename(estimate=dec_p, sd=sd_dec_p) %>%
              mutate(type = 'dec_p')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, bulk, sd_bulk) %>% 
              rename(estimate=bulk, sd=sd_bulk) %>%
              mutate(type = 'bulk')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, sing_p, sd_sing_p) %>% 
              rename(estimate=sing_p, sd=sd_sing_p) %>%
              mutate(type = 'sing_p')) %>%
bind_rows(plot_df %>% 
              select(log_fc_p, sing_b, sd_sing_b) %>% 
              rename(estimate=sing_b, sd=sd_sing_b) %>%
              mutate(type = 'sing_b'))
graph_df[,1:3] <- graph_df[,1:3]*log(exp(1),2) #scale to log 2
p1 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_p', 'dec_p', 'bulk', 'sing_p','SPARK_p'),], aes(x=log_fc_p,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=1,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03) + ylab('Estimated cell type A DE')+ xlab('Differential expression in cell type A') + ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00', '#EFCB00','#CC79A7'), breaks = c('bulk', 'dec_p' ,'CSIDE_p', 'sing_p','SPARK_p'), labels = c('Bulk','Decompose','CSIDE','Single','SPARK')) + scale_x_continuous(breaks = unique(graph_df$log_fc_p), labels = round(unique(graph_df$log_fc_p),2))
p2 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_b', 'dec_b', 'bulk', 'sing_b', 'SPARK_b'),], aes(x=log_fc_p,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=0,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03)+ ylab('Estimated cell type B DE')+ xlab('Differential expression in the other cell type')+ ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00','#EFCB00', '#CC79A7'), breaks = c('bulk', 'dec_b' ,'CSIDE_b', 'sing_b','SPARK_b'), labels = c('Bulk','Decompose','CSIDE', 'Single','SPARK')) + scale_x_continuous(breaks = unique(graph_df$log_fc_p), labels = round(unique(graph_df$log_fc_p),2))
ggarrange(p1,p2, nrow = 2)

Run SPARK on cell type proportion simulation

de_ground_truth <- c(1,1)
de_gene <- 'Astn2'
de_gt_vals <- 2:8 #cell type proportions
N_int <- 10
region_base <- c(rep(0, floor(N_samples/2)), rep(1, ceiling(N_samples/2)))
COND <- length(de_gt_vals) # number of conditions for DE
REPLICATES <- 100
alpha_res <- array(0, dim = c(REPLICATES, 2, COND))
for(condi in 1:COND) {
  print(condi)
  thresh <- de_gt_vals[condi]
  region <- region_base
  region[((1:N_samples) %% N_int) < thresh] <- 1 - region[((1:N_samples) %% 10) < thresh]
  de_res <- sim_SPARK(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                      UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta, UMI_vect,
                      other_methods = T,region_orig = region)
  alpha_res[,, condi] <- de_res
}
saveRDS(alpha_res,'spark_proportion.rds')

Plot cell type proportion simulation results

REPLICATES <- 100
de_ground_truth <- c(1,1)
de_gt_vals <- 2:8
alpha_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/alpha_res.rds'))
dec_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/dec_res.rds'))
bulk_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/bulk_res.rds'))
sing_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/sing_res.rds'))
spark_res <- readRDS(file.path(resultsdir, 'spark/spark_proportion.rds'))
N_int <- 10
ct_prop <- de_gt_vals / N_int
plot_df <- as.data.frame(cbind(ct_prop, log(de_ground_truth[1]), log(de_ground_truth[2]), apply(alpha_res,3:2,mean), apply(spark_res, 3:2, mean),apply(dec_res,3:2,mean), apply(bulk_res,2,mean), apply(sing_res,3:2,mean), apply(alpha_res,3:2,sd), apply(spark_res, 3:2, sd),apply(dec_res,3:2,sd), apply(bulk_res,2,sd), apply(sing_res,3:2,sd)))
colnames(plot_df) <- c('ct_prop','log_fc_b', 'log_fc_p', 'CSIDE_b', 'CSIDE_p','SPARK_b', 'SPARK_p', 'dec_b', 'dec_p', 'bulk', 'sing_b', 'sing_p', 'sd_CSIDE_b','sd_CSIDE_p','sd_SPARK_b','sd_SPARK_p', 'sd_dec_b','sd_dec_p','sd_bulk', 'sd_sing_b', 'sd_sing_p')
graph_df <- plot_df %>% 
  select(ct_prop, CSIDE_b, sd_CSIDE_b) %>% 
  rename(estimate=CSIDE_b, sd=sd_CSIDE_b) %>%
  mutate(type = 'CSIDE_b') %>%
  bind_rows(plot_df %>% 
              select(ct_prop, CSIDE_p, sd_CSIDE_p) %>% 
              rename(estimate=CSIDE_p, sd=sd_CSIDE_p) %>%
              mutate(type = 'CSIDE_p')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, SPARK_p, sd_SPARK_p) %>% 
              rename(estimate=SPARK_p, sd=sd_SPARK_p) %>%
              mutate(type = 'SPARK_p')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, SPARK_b, sd_SPARK_b) %>% 
              rename(estimate=SPARK_b, sd=sd_SPARK_b) %>%
              mutate(type = 'SPARK_b')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, dec_b, sd_dec_b) %>% 
              rename(estimate=dec_b, sd=sd_dec_b) %>%
              mutate(type = 'dec_b')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, dec_p, sd_dec_p) %>% 
              rename(estimate=dec_p, sd=sd_dec_p) %>%
              mutate(type = 'dec_p')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, bulk, sd_bulk) %>% 
              rename(estimate=bulk, sd=sd_bulk) %>%
              mutate(type = 'bulk')) %>%
bind_rows(plot_df %>% 
              select(ct_prop, sing_p, sd_sing_p) %>% 
              rename(estimate=sing_p, sd=sd_sing_p) %>%
              mutate(type = 'sing_p')) %>%
bind_rows(plot_df %>% 
              select(ct_prop, sing_b, sd_sing_b) %>% 
              rename(estimate=sing_b, sd=sd_sing_b) %>%
              mutate(type = 'sing_b'))
graph_df[,2:3] <- graph_df[,2:3]*log(exp(1),2) #scale to log 2
graph_df$ct_prop <- 2*(graph_df$ct_prop - 0.5)
p1 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_p', 'dec_p', 'bulk', 'sing_p','SPARK_p'),], aes(x=ct_prop,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=0,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03)  + ylab('Estimated cell type A DE')+ xlab('Difference of mean cell type proportion across regions') + ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00','#EFCB00', '#CC79A7'), breaks = c('bulk', 'dec_p' ,'CSIDE_p', 'sing_p', 'SPARK_p'), labels = c('Bulk','Decompose','CSIDE','Single', 'SPARK')) + scale_x_continuous(breaks = unique(graph_df$ct_prop), labels = round(unique(graph_df$ct_prop),2))
p2 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_b', 'dec_b', 'bulk', 'sing_b', 'SPARK_b'),], aes(x=ct_prop,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=0,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03) + ylab('Estimated cell type B DE')+ xlab('Difference of mean cell type proportion across regions') + ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00','#EFCB00', '#CC79A7'), breaks = c('bulk', 'dec_b' ,'CSIDE_b','sing_b', 'SPARK_b'), labels = c('Bulk','Decompose','CSIDE','Single','SPARK'))+ scale_x_continuous(breaks = unique(graph_df$ct_prop), labels = round(unique(graph_df$ct_prop),2))
ggarrange(p1,p2, nrow = 2)

Run CSIDE on null and non-null DE case

gene_list <- rownames(myRCTD@spatialRNA@counts)
N_genes <- 15
for(non_null in c(T,F)) {
  high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
  if(T)
    high_genes <- high_genes[which(cell_type_info[[1]][high_genes, common_cell_types][,1] < 3e-2 & cell_type_info[[1]][high_genes, common_cell_types][,2] < 3e-2)]

  set.seed(123)
  cur_gene_list <- sample(high_genes,N_genes)
  if(!non_null) {
    REPLICATES <- 500
    cur_gene_list <- sample(high_genes, N_genes)
  } else {
    REPLICATES <- 100
  }
  e_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list)))
  s_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list)))
  de_ground_truth <- c(1,1)
  if(non_null) {
    de_vals <- (((1:length(cur_gene_list)) - 1)/(length(cur_gene_list) - 1))*2-1
  } else {
    de_vals <- rep(0, length(cur_gene_list))
  }
  for(i in 1:length(cur_gene_list)) {
    print(i)
    de_gene <- cur_gene_list[i]
    de_ground_truth[2] <- exp(de_vals[i])
    de_res <- sim_CSIDE(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                         UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta, UMI_vect) 
    e_all[,, i] <- de_res$e_res
    s_all[,,i] <- de_res$s_res
  }
  if(non_null) {
    saveRDS(e_all, file.path(resultsdir, 'non_null_mean/e_all.rds'))
    saveRDS(s_all, file.path(resultsdir, 'non_null_mean/s_all.rds'))
  } else {
    saveRDS(e_all, file.path(resultsdir, 'null_mean/e_all.rds'))
    saveRDS(s_all, file.path(resultsdir, 'null_mean/s_all.rds'))
  }
}

Plot null DE estimation results

gene_list <- rownames(myRCTD@spatialRNA@counts) 
high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
N_genes <- 15
REPLICATES <- 500
non_null <- F
set.seed(123)
cur_gene_list <- sample(high_genes,N_genes)
if(!non_null) {
  cur_gene_list <- sample(high_genes, N_genes)
} 
de_vals <- rep(0, length(cur_gene_list))
if(non_null) {
  e_all <- readRDS(file.path(resultsdir, 'non_null_mean/e_all.rds'))
  s_all <- readRDS(file.path(resultsdir, 'non_null_mean/s_all.rds'))
} else {
  e_all <- readRDS(file.path(resultsdir, 'null_mean/e_all.rds'))
  s_all <- readRDS(file.path(resultsdir, 'null_mean/s_all.rds'))
}
se <- apply(e_all[,2,], 2,sd)/sqrt(REPLICATES)
results <- colMeans(e_all[,2,])
plot_df <- data.frame(cur_gene_list[1:length(results)], results, se, de_vals)
colnames(plot_df) <- c('gene', 'mean', 'se', 'de')
plot_df[,c('mean','se')] <- plot_df[,c('mean','se')] * log(exp(1),2) # rescale to log 2
if(! non_null) {
  p <- ggplot(plot_df,aes(x=gene, y = mean)) + geom_point() + geom_errorbar(aes(x=gene,ymin = mean-1.96*se, ymax = mean + 1.96*se), width = .2)+ geom_hline(yintercept = 0) + theme_classic()  + theme(axis.text.x = element_text(angle=25,hjust = 1)) + xlab('Gene') + ylab('Log CSIDE estimated differential expression')#+ ggrepel::geom_label_repel(aes(label = gene),nudge_x = 0.15,na.rm = TRUE)
  p
} else {
  p <- ggplot(plot_df,aes(x=de, y = mean)) + geom_point() + geom_errorbar(aes(ymin = mean-1.96*se, ymax = mean + 1.96*se), width = 0.05) + geom_line(aes(x = de, y = de)) + theme_classic() + ggrepel::geom_label_repel(aes(label = gene),nudge_x = 0.15,na.rm = TRUE) 
  p
}

Plot p-value enrichment

z_all <- e_all / s_all
p_res <- (2*(1 - pnorm(abs(z_all))))
alpha_vals = c(.0025,.005,.01, .02, .03, .05, .1, .2)
enrichment = numeric(length(alpha_vals))
se = numeric(length(alpha_vals))
N_trials <- length(p_res)
for(i in 1:length(alpha_vals)) {
  alpha <- alpha_vals[i]
  p_curr <- (sum(p_res < alpha) / (N_trials))
  enrichment[i] <- p_curr / alpha
  se[i] <- sqrt(p_curr*(1-p_curr)/N_trials)/alpha
}
plot_df <- data.frame(alpha_vals, enrichment, se)
p1 <- ggplot(plot_df, aes(x=alpha_vals,y = log(enrichment,2))) + geom_point() + ylim(c(-1.5,1.5)) + theme_classic() + geom_hline(yintercept = 0, linetype = 'dotted') + geom_errorbar(aes(ymin = log(enrichment - 1.96*se,2), ymax = log(enrichment + 1.96*se,2)), width = .005) + xlab('Significance level') + ylab('Log ratio of observed vs expected false positive rate')
p1

Plot standard error prediction

errors <- sweep(e_all,3,de_vals,'-')
vars <- s_all^2
errors <- errors[order(vars)]^2
vars <- vars[order(vars)] 
bin_size <- 500
N_bins <- length(vars) / bin_size
pred_list <- numeric(N_bins)
obs_list <- numeric(N_bins)
se_list <- numeric(N_bins)
for(i in 1:N_bins) {
  offset <- bin_size * (i-1)
  pred_list[i] <- mean(vars[(1 + offset):(100+offset)])
  obs_list[i] <- mean(errors[(1 + offset):(100+offset)])
  se_list[i] <- sd(errors[(1 + offset):(100+offset)])/sqrt(bin_size)
}
plot_df <- data.frame(pred_list, obs_list, se_list)
colnames(plot_df) <- c('pred', 'obs', 'se')
plot_df <- plot_df * log(exp(1),2)
p <- ggplot(plot_df) + geom_point(aes(sqrt(pred), sqrt(obs))) + geom_errorbar(aes(x = sqrt(pred), ymin = sqrt(obs - 1.96*se), ymax = sqrt(obs + 1.96*se))) + geom_abline() + theme_classic() + xlim(c(0,0.4)) + ylim(c(0,0.4)) + xlab('CSIDE predicted standard error') + ylab('Average observed standard error')
p 

Run power analysis

gene_list <- rownames(myRCTD@spatialRNA@counts)
high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
high_genes <- high_genes[which(cell_type_info[[1]][high_genes, common_cell_types][,1] < 3e-2 & cell_type_info[[1]][high_genes, common_cell_types][,2] < 3e-2)]

REPLICATES <- 100 
DE_CONDITIONS <- 7
de_vals <- (((1:DE_CONDITIONS) - 1)/(DE_CONDITIONS - 1))*2-1
de_ground_truth <- c(1,1)
cur_gene_list <- high_genes
NUM_CELLS <- c(250, 500, 1000)
e_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list), length(NUM_CELLS), DE_CONDITIONS))
s_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list), length(NUM_CELLS), DE_CONDITIONS))
dimnames(e_all) <- list(NULL,NULL,high_genes,NULL,NULL)
dimnames(s_all) <- list(NULL,NULL,high_genes,NULL,NULL)
for(k in 1:DE_CONDITIONS) {
  print(paste("DE",k))
  for(j in 1:length(NUM_CELLS)) {
    print(paste('Ncells',j))
    for(i in 1:length(cur_gene_list)) {
      print(paste('gene',i))
      de_gene <- cur_gene_list[i]
      de_ground_truth[2] <- exp(de_vals[k])
      de_res <- sim_CSIDE(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                           UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta, UMI_vect,
                           subset_cells = NUM_CELLS[j], regularize_expr = T)
      e_all[,, i, j , k] <- de_res$e_res
      s_all[,,i, j, k] <- de_res$s_res
    }
  }
}
saveRDS(e_all, file.path(resultsdir, 'power/e_all.rds'))
saveRDS(s_all, file.path(resultsdir, 'power/s_all.rds'))

Plot power analysis

NUM_CELLS <- c(250, 500, 1000)
DE_CONDITIONS <- 7
de_vals <- (((1:DE_CONDITIONS) - 1)/(DE_CONDITIONS - 1))*2-1
gene_list <- rownames(myRCTD@spatialRNA@counts)
high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
high_genes <- high_genes[which(cell_type_info[[1]][high_genes, common_cell_types][,1] < 3e-2 & cell_type_info[[1]][high_genes, common_cell_types][,2] < 3e-2)]
e_all <- readRDS(file.path(resultsdir, 'power/e_all.rds'))
s_all <- readRDS(file.path(resultsdir, 'power/s_all.rds'))
z_all <- e_all / s_all 
p_all <- 2*(1 - pnorm(abs(z_all)))
alpha <- 0.01
power_estimates <- apply(p_all[,2,,,] < 0.01,c(2,3,4),mean)
purk_expr <- log(cell_type_info[[1]][high_genes,'Purkinje'])
plot_df <- data.frame(purk_expr, rowMeans(power_estimates[,1,]))
colnames(plot_df) <- c('expr','power')
R2 <- cor(plot_df$expr, plot_df$power)^2
p1 <- ggplot(plot_df, aes(x = expr, y = power)) + geom_point() + ggtitle(paste('R2=',R2)) + ylim(c(0,1)) + theme_classic()
my_order <- order(purk_expr)
pre_df <- as.data.frame(t(power_estimates[my_order,1,]))
pre_df$de <- de_vals
pre_df$ncell <- NUM_CELLS[1]
plot_df1 <- reshape2::melt(pre_df, id = c('de', 'ncell'))
pre_df <- as.data.frame(t(power_estimates[my_order,2,]))
pre_df$de <- de_vals
pre_df$ncell <- NUM_CELLS[2]
plot_df2 <- reshape2::melt(pre_df, id = c('de', 'ncell'))
pre_df <- as.data.frame(t(power_estimates[my_order,3,]))
pre_df$de <- de_vals
pre_df$ncell <- NUM_CELLS[3]
plot_df3 <- reshape2::melt(pre_df, id = c('de', 'ncell'))
plot_df <- rbind(plot_df1, plot_df2, plot_df3)
norm_purk_expr <- purk_expr - min(purk_expr)
norm_purk_expr <- norm_purk_expr / max(norm_purk_expr)
plot_df <- rbind(plot_df,
                 data.frame('de' = 0, 'ncell' = "Expression", 'variable' = high_genes,'value' = norm_purk_expr))
plot_df$ncell <- factor(plot_df$ncell, levels = c('Expression','250','500','1000'))
plot_df$label = round(plot_df$value, 2)
plot_df$label[plot_df$ncell == 'Expression'] <- NA
plot_df$de <- plot_df$de*log(exp(1),2)
my_labels <- names(table(plot_df$de))
p2 <- ggplot(plot_df[plot_df$ncell != 'Expression',], aes(y = variable, x = de, fill = value)) + facet_wrap(ncell ~., nrow = 1) + geom_raster() + theme_classic() +
  scale_fill_continuous("Power",high = "#132B43", low = "#56B1F7") + geom_text(aes(label = label), size = 2, color = 'white') + scale_x_continuous(breaks=round(as.numeric(my_labels),2)) + ylab("Gene") + xlab("Cell type A differential expression") + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
p3 <- ggplot(plot_df[plot_df$ncell == 'Expression',], aes(y = variable, x = de, fill = value)) + facet_wrap(ncell ~., nrow = 1) + geom_raster() + theme_classic() + scale_x_continuous(breaks=round(as.numeric(my_labels),2)) +scale_fill_continuous("Log cell type A expression",high = pals::brewer.reds(20)[20], low = pals::brewer.reds(20)[2], breaks = c(0,1), labels =round(c(min(purk_expr), max(purk_expr))*log(exp(1),2),2)) + xlab("") + ylab("Gene")+ theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
ggarrange(p2,p3,nrow = 1, widths = c(3.3,1))
p_my <- p_all[,2,,,]
ind <- 3 # DE
alpha <- 0.05
ncells <- 3
alpha_vals <- c((1:1000)/1000)
alpha_vals <- c(alpha_vals / 1000^3, alpha_vals / 1000^2, alpha_vals / 1000, alpha_vals)
all_res <- array(dim = c(2,3,4,length(alpha_vals))) # fpr/tpr ncells ind alpha_vals
for(k in 1:2) {
  for(ncells in 1:3) {
    for(ind in 1:4) {
      fpr <- numeric(length(alpha_vals))
      tpr <- numeric(length(alpha_vals))
      for(i in 1:length(alpha_vals)) {
        alpha <- alpha_vals[i]
        all_res[2,ncells,ind,i] <- mean(p_my[,,ncells,c(ind, 8 - ind)] < alpha)
        all_res[1,ncells,ind,i] <- mean(p_my[,,ncells,4] < alpha)
      }
    }
  }
}
plot_df <- melt(all_res[,,1:3,])
plot_df <- cbind(plot_df[(1:(dim(plot_df)[1]/2))*2,], plot_df[(1:(dim(plot_df)[1]/2))*2 - 1,])
plot_df <- plot_df[c(1,2,3,4,5,10)]
colnames(plot_df) <- c('type', 'cells', 'ind', 'aind', 'tpr', 'fpr')
#plot_df$ind <- factor(plot_df$ind)
plot_df$cells <- (2^plot_df$cells*125)
plot_df <- bind_rows(data.frame(type=c(2,2,2),cells=c(250,500,1000),ind=c(3,3,3),aind=c(3,3,3),tpr=c(0,0,0),fpr=c(0,0,0)), plot_df)
plot_df$ind <- factor(plot_df$ind)
plot_df$cells <- factor(plot_df$cells)
ggplot(plot_df, aes(x=fpr, y = tpr, color = ind)) + 
  geom_line() + facet_wrap(cells ~., nrow = 1) + theme_classic() + xlab('False positive rate') + ylab('True positive rate') + 
  scale_color_manual("True DE",values = c('#0072B2','#009E73','#D55E00'), labels = c(1.44,.96,.48))+ coord_fixed() + geom_abline(lty = 'dashed') + xlim(c(0,1)) + ylim(c(0,1))
#plot(fpr, tpr, ylim = c(0,1), main = ind)


dmcable/RCTD documentation built on Feb. 24, 2024, 11:03 p.m.