In dmcable/RCTD: SpatialeXpressionR: Cell type identification and cell type-specific differential expression in spatial transcriptomics

knitr::opts_chunk$set(echo = T, warning = F, message = F, cache = T, results = 'hide')

Benchmarking CSIDE on a simulated spatial transcriptomics dataset

library(spacexr)
library(Matrix)
library(devtools)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(dplyr)
source('~/Documents/MIT/Research/Rafalab/Projects/spacexr/AnalysisCSIDE/helper_functions/de_simulation_helper.R')
devtools::load_all()
load_all()
set_global_Q_all()

ref <- readRDS("../../data/Reference/10xCer/reference.rds")
resultsdir <- '../../data/Reference/10xCer/de_simulation'
common_cell_types <- c('Bergmann', 'Purkinje')
n_cell_types = length(common_cell_types)
trials = 77 
n_conditions = 13
boundary = ceiling(n_conditions / 2) 
N_samples = (n_cell_types * trials * n_conditions * (n_cell_types - 1))/2
first_UMI = numeric(N_samples); first_type = character(N_samples); second_type = character(N_samples)
UMI_tot = 1000; UMI_step = UMI_tot / (n_conditions-1)
UMI_tot = round(UMI_step * (n_conditions-1)); UMI1_vec = round(0:(n_conditions-1)*UMI_step)
nUMI = ref@nUMI
UMI1_vec = round(0:(n_conditions-1)*UMI_step)
UMI1 <- rep(UMI1_vec,each = trials)
UMI_vect <- rep(UMI_tot, N_samples)
cell_type_names <- levels(ref@cell_types)

Run RCTD on simulated data (the first step of CSIDE)

puck.original <- SpatialRNA(NULL, ref@counts, nUMI = ref@nUMI, T)
print('create.RCTD: getting regression differentially expressed genes: ')
cell_type_info <- list(info = process_cell_type_info(ref, cell_type_names, CELL_MIN = 25), renorm = NULL)
gene_list= get_de_genes(cell_type_info$info, puck.original)
puck <- generate_sim_puck(common_cell_types, gene_list, ref)
reference <- restrict_reference_cell_types(ref, common_cell_types) # only relevant cell types
myRCTD <- create.RCTD(puck, reference, max_cores = 1)
myRCTD <- run.RCTD(myRCTD)
saveRDS(myRCTD, file.path(resultsdir, 'myRCTD.rds'))

Load in RCTD results

myRCTD <- readRDS(file.path(resultsdir, 'myRCTD.rds'))
puck <- myRCTD@originalSpatialRNA
sigma_init <- as.character(100*myRCTD@internal_vars$sigma)
results <- myRCTD@results
norm_weights = sweep(results$weights, 1, rowSums(results$weights), '/')
beta <- as.matrix(norm_weights)
cell_type_info <- myRCTD@cell_type_info$info

Run SPARC on contamination simulation

de_gt_vals <- exp((-3:3)/5)
gt_val <- 1
COND <- length(de_gt_vals) # number of conditions for DE
REPLICATES <- 100 # 100
alpha_res <- array(0, dim = c(REPLICATES, 2, COND))
thresh <- 5 #flip 50% of them
region_base <- c(rep(0, floor(N_samples/2)), rep(1, ceiling(N_samples/2)))
region <- region_base
N_int <- 10
region[((1:N_samples) %% N_int) < thresh] <- 1 - region[((1:N_samples) %% 10) < thresh]
de_gene <- 'Nrxn3'
for(condi in 1:COND) {
  de_ground_truth <- c(gt_val, de_gt_vals[condi])
  de_res <- sim_SPARK(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                      UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta,UMI_vect,
                      other_methods = T, region_orig = region)
  alpha_res[,, condi] <- de_res
}
saveRDS(alpha_res, 'spark-contam.rds')

Plot contamination simulation results

alpha_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/alpha_res.rds'))
dec_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/dec_res.rds'))
bulk_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/bulk_res.rds'))
sing_res <- readRDS(file.path(resultsdir, 'other_methods_contamination/sing_res.rds'))
spark_res <- readRDS(file.path(resultsdir, 'spark/spark-contam.rds'))
REPLICATES <- 100
de_gt_vals <- exp((-3:3)/5)
gt_val <- 1
plot_df <- as.data.frame(cbind( log(gt_val), log(de_gt_vals), apply(alpha_res,3:2,mean), apply(spark_res, 3:2, mean), apply(dec_res,3:2,mean), apply(bulk_res,2,mean), apply(sing_res,3:2,mean), apply(alpha_res,3:2,sd),apply(spark_res, 3:2, sd), apply(dec_res,3:2,sd), apply(bulk_res,2,sd), apply(sing_res,3:2,sd)))
colnames(plot_df) <- c('log_fc_b', 'log_fc_p', 'CSIDE_b', 'CSIDE_p', 'SPARK_b', 'SPARK_p','dec_b', 'dec_p', 'bulk','sing_b','sing_p', 'sd_CSIDE_b','sd_CSIDE_p','sd_SPARK_b','sd_SPARK_p', 'sd_dec_b','sd_dec_p','sd_bulk', 'sd_sing_b','sd_sing_p')
graph_df <- plot_df %>% 
  select(log_fc_p, CSIDE_b, sd_CSIDE_b) %>% 
  rename(estimate=CSIDE_b, sd=sd_CSIDE_b) %>%
  mutate(type = 'CSIDE_b') %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, CSIDE_p, sd_CSIDE_p) %>% 
              rename(estimate=CSIDE_p, sd=sd_CSIDE_p) %>%
              mutate(type = 'CSIDE_p')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, SPARK_p, sd_SPARK_p) %>% 
              rename(estimate=SPARK_p, sd=sd_SPARK_p) %>%
              mutate(type = 'SPARK_p')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, SPARK_b, sd_SPARK_b) %>% 
              rename(estimate=SPARK_b, sd=sd_SPARK_b) %>%
              mutate(type = 'SPARK_b')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, dec_b, sd_dec_b) %>% 
              rename(estimate=dec_b, sd=sd_dec_b) %>%
              mutate(type = 'dec_b')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, dec_p, sd_dec_p) %>% 
              rename(estimate=dec_p, sd=sd_dec_p) %>%
              mutate(type = 'dec_p')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, bulk, sd_bulk) %>% 
              rename(estimate=bulk, sd=sd_bulk) %>%
              mutate(type = 'bulk')) %>%
  bind_rows(plot_df %>% 
              select(log_fc_p, sing_p, sd_sing_p) %>% 
              rename(estimate=sing_p, sd=sd_sing_p) %>%
              mutate(type = 'sing_p')) %>%
bind_rows(plot_df %>% 
              select(log_fc_p, sing_b, sd_sing_b) %>% 
              rename(estimate=sing_b, sd=sd_sing_b) %>%
              mutate(type = 'sing_b'))
graph_df[,1:3] <- graph_df[,1:3]*log(exp(1),2) #scale to log 2
p1 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_p', 'dec_p', 'bulk', 'sing_p','SPARK_p'),], aes(x=log_fc_p,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=1,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03) + ylab('Estimated cell type A DE')+ xlab('Differential expression in cell type A') + ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00', '#EFCB00','#CC79A7'), breaks = c('bulk', 'dec_p' ,'CSIDE_p', 'sing_p','SPARK_p'), labels = c('Bulk','Decompose','CSIDE','Single','SPARK')) + scale_x_continuous(breaks = unique(graph_df$log_fc_p), labels = round(unique(graph_df$log_fc_p),2))
p2 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_b', 'dec_b', 'bulk', 'sing_b', 'SPARK_b'),], aes(x=log_fc_p,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=0,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03)+ ylab('Estimated cell type B DE')+ xlab('Differential expression in the other cell type')+ ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00','#EFCB00', '#CC79A7'), breaks = c('bulk', 'dec_b' ,'CSIDE_b', 'sing_b','SPARK_b'), labels = c('Bulk','Decompose','CSIDE', 'Single','SPARK')) + scale_x_continuous(breaks = unique(graph_df$log_fc_p), labels = round(unique(graph_df$log_fc_p),2))
ggarrange(p1,p2, nrow = 2)

Run SPARK on cell type proportion simulation

de_ground_truth <- c(1,1)
de_gene <- 'Astn2'
de_gt_vals <- 2:8 #cell type proportions
N_int <- 10
region_base <- c(rep(0, floor(N_samples/2)), rep(1, ceiling(N_samples/2)))
COND <- length(de_gt_vals) # number of conditions for DE
REPLICATES <- 100
alpha_res <- array(0, dim = c(REPLICATES, 2, COND))
for(condi in 1:COND) {
  print(condi)
  thresh <- de_gt_vals[condi]
  region <- region_base
  region[((1:N_samples) %% N_int) < thresh] <- 1 - region[((1:N_samples) %% 10) < thresh]
  de_res <- sim_SPARK(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                      UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta, UMI_vect,
                      other_methods = T,region_orig = region)
  alpha_res[,, condi] <- de_res
}
saveRDS(alpha_res,'spark_proportion.rds')

Plot cell type proportion simulation results

REPLICATES <- 100
de_ground_truth <- c(1,1)
de_gt_vals <- 2:8
alpha_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/alpha_res.rds'))
dec_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/dec_res.rds'))
bulk_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/bulk_res.rds'))
sing_res <- readRDS(file.path(resultsdir, 'other_methods_proportion/sing_res.rds'))
spark_res <- readRDS(file.path(resultsdir, 'spark/spark_proportion.rds'))
N_int <- 10
ct_prop <- de_gt_vals / N_int
plot_df <- as.data.frame(cbind(ct_prop, log(de_ground_truth[1]), log(de_ground_truth[2]), apply(alpha_res,3:2,mean), apply(spark_res, 3:2, mean),apply(dec_res,3:2,mean), apply(bulk_res,2,mean), apply(sing_res,3:2,mean), apply(alpha_res,3:2,sd), apply(spark_res, 3:2, sd),apply(dec_res,3:2,sd), apply(bulk_res,2,sd), apply(sing_res,3:2,sd)))
colnames(plot_df) <- c('ct_prop','log_fc_b', 'log_fc_p', 'CSIDE_b', 'CSIDE_p','SPARK_b', 'SPARK_p', 'dec_b', 'dec_p', 'bulk', 'sing_b', 'sing_p', 'sd_CSIDE_b','sd_CSIDE_p','sd_SPARK_b','sd_SPARK_p', 'sd_dec_b','sd_dec_p','sd_bulk', 'sd_sing_b', 'sd_sing_p')
graph_df <- plot_df %>% 
  select(ct_prop, CSIDE_b, sd_CSIDE_b) %>% 
  rename(estimate=CSIDE_b, sd=sd_CSIDE_b) %>%
  mutate(type = 'CSIDE_b') %>%
  bind_rows(plot_df %>% 
              select(ct_prop, CSIDE_p, sd_CSIDE_p) %>% 
              rename(estimate=CSIDE_p, sd=sd_CSIDE_p) %>%
              mutate(type = 'CSIDE_p')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, SPARK_p, sd_SPARK_p) %>% 
              rename(estimate=SPARK_p, sd=sd_SPARK_p) %>%
              mutate(type = 'SPARK_p')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, SPARK_b, sd_SPARK_b) %>% 
              rename(estimate=SPARK_b, sd=sd_SPARK_b) %>%
              mutate(type = 'SPARK_b')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, dec_b, sd_dec_b) %>% 
              rename(estimate=dec_b, sd=sd_dec_b) %>%
              mutate(type = 'dec_b')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, dec_p, sd_dec_p) %>% 
              rename(estimate=dec_p, sd=sd_dec_p) %>%
              mutate(type = 'dec_p')) %>%
  bind_rows(plot_df %>% 
              select(ct_prop, bulk, sd_bulk) %>% 
              rename(estimate=bulk, sd=sd_bulk) %>%
              mutate(type = 'bulk')) %>%
bind_rows(plot_df %>% 
              select(ct_prop, sing_p, sd_sing_p) %>% 
              rename(estimate=sing_p, sd=sd_sing_p) %>%
              mutate(type = 'sing_p')) %>%
bind_rows(plot_df %>% 
              select(ct_prop, sing_b, sd_sing_b) %>% 
              rename(estimate=sing_b, sd=sd_sing_b) %>%
              mutate(type = 'sing_b'))
graph_df[,2:3] <- graph_df[,2:3]*log(exp(1),2) #scale to log 2
graph_df$ct_prop <- 2*(graph_df$ct_prop - 0.5)
p1 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_p', 'dec_p', 'bulk', 'sing_p','SPARK_p'),], aes(x=ct_prop,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=0,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03)  + ylab('Estimated cell type A DE')+ xlab('Difference of mean cell type proportion across regions') + ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00','#EFCB00', '#CC79A7'), breaks = c('bulk', 'dec_p' ,'CSIDE_p', 'sing_p', 'SPARK_p'), labels = c('Bulk','Decompose','CSIDE','Single', 'SPARK')) + scale_x_continuous(breaks = unique(graph_df$ct_prop), labels = round(unique(graph_df$ct_prop),2))
p2 <- ggplot(graph_df[graph_df$type %in% c('CSIDE_b', 'dec_b', 'bulk', 'sing_b', 'SPARK_b'),], aes(x=ct_prop,y=estimate,group=type, color = type)) + geom_line() + theme_classic() + geom_abline(slope=0,intercept=0) + geom_errorbar(aes(ymin = estimate - sd*1.96/sqrt(REPLICATES),ymax=estimate+sd*1.96/sqrt(REPLICATES)), width = 0.03) + ylab('Estimated cell type B DE')+ xlab('Difference of mean cell type proportion across regions') + ggplot2::scale_color_manual("Method",values = c('#0072B2','#009E73','#D55E00','#EFCB00', '#CC79A7'), breaks = c('bulk', 'dec_b' ,'CSIDE_b','sing_b', 'SPARK_b'), labels = c('Bulk','Decompose','CSIDE','Single','SPARK'))+ scale_x_continuous(breaks = unique(graph_df$ct_prop), labels = round(unique(graph_df$ct_prop),2))
ggarrange(p1,p2, nrow = 2)

Run CSIDE on null and non-null DE case

gene_list <- rownames(myRCTD@spatialRNA@counts)
N_genes <- 15
for(non_null in c(T,F)) {
  high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
  if(T)
    high_genes <- high_genes[which(cell_type_info[[1]][high_genes, common_cell_types][,1] < 3e-2 & cell_type_info[[1]][high_genes, common_cell_types][,2] < 3e-2)]

  set.seed(123)
  cur_gene_list <- sample(high_genes,N_genes)
  if(!non_null) {
    REPLICATES <- 500
    cur_gene_list <- sample(high_genes, N_genes)
  } else {
    REPLICATES <- 100
  }
  e_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list)))
  s_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list)))
  de_ground_truth <- c(1,1)
  if(non_null) {
    de_vals <- (((1:length(cur_gene_list)) - 1)/(length(cur_gene_list) - 1))*2-1
  } else {
    de_vals <- rep(0, length(cur_gene_list))
  }
  for(i in 1:length(cur_gene_list)) {
    print(i)
    de_gene <- cur_gene_list[i]
    de_ground_truth[2] <- exp(de_vals[i])
    de_res <- sim_CSIDE(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                         UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta, UMI_vect) 
    e_all[,, i] <- de_res$e_res
    s_all[,,i] <- de_res$s_res
  }
  if(non_null) {
    saveRDS(e_all, file.path(resultsdir, 'non_null_mean/e_all.rds'))
    saveRDS(s_all, file.path(resultsdir, 'non_null_mean/s_all.rds'))
  } else {
    saveRDS(e_all, file.path(resultsdir, 'null_mean/e_all.rds'))
    saveRDS(s_all, file.path(resultsdir, 'null_mean/s_all.rds'))
  }
}

Plot null DE estimation results

gene_list <- rownames(myRCTD@spatialRNA@counts) 
high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
N_genes <- 15
REPLICATES <- 500
non_null <- F
set.seed(123)
cur_gene_list <- sample(high_genes,N_genes)
if(!non_null) {
  cur_gene_list <- sample(high_genes, N_genes)
} 
de_vals <- rep(0, length(cur_gene_list))
if(non_null) {
  e_all <- readRDS(file.path(resultsdir, 'non_null_mean/e_all.rds'))
  s_all <- readRDS(file.path(resultsdir, 'non_null_mean/s_all.rds'))
} else {
  e_all <- readRDS(file.path(resultsdir, 'null_mean/e_all.rds'))
  s_all <- readRDS(file.path(resultsdir, 'null_mean/s_all.rds'))
}
se <- apply(e_all[,2,], 2,sd)/sqrt(REPLICATES)
results <- colMeans(e_all[,2,])
plot_df <- data.frame(cur_gene_list[1:length(results)], results, se, de_vals)
colnames(plot_df) <- c('gene', 'mean', 'se', 'de')
plot_df[,c('mean','se')] <- plot_df[,c('mean','se')] * log(exp(1),2) # rescale to log 2
if(! non_null) {
  p <- ggplot(plot_df,aes(x=gene, y = mean)) + geom_point() + geom_errorbar(aes(x=gene,ymin = mean-1.96*se, ymax = mean + 1.96*se), width = .2)+ geom_hline(yintercept = 0) + theme_classic()  + theme(axis.text.x = element_text(angle=25,hjust = 1)) + xlab('Gene') + ylab('Log CSIDE estimated differential expression')#+ ggrepel::geom_label_repel(aes(label = gene),nudge_x = 0.15,na.rm = TRUE)
  p
} else {
  p <- ggplot(plot_df,aes(x=de, y = mean)) + geom_point() + geom_errorbar(aes(ymin = mean-1.96*se, ymax = mean + 1.96*se), width = 0.05) + geom_line(aes(x = de, y = de)) + theme_classic() + ggrepel::geom_label_repel(aes(label = gene),nudge_x = 0.15,na.rm = TRUE) 
  p
}

Plot p-value enrichment

z_all <- e_all / s_all
p_res <- (2*(1 - pnorm(abs(z_all))))
alpha_vals = c(.0025,.005,.01, .02, .03, .05, .1, .2)
enrichment = numeric(length(alpha_vals))
se = numeric(length(alpha_vals))
N_trials <- length(p_res)
for(i in 1:length(alpha_vals)) {
  alpha <- alpha_vals[i]
  p_curr <- (sum(p_res < alpha) / (N_trials))
  enrichment[i] <- p_curr / alpha
  se[i] <- sqrt(p_curr*(1-p_curr)/N_trials)/alpha
}
plot_df <- data.frame(alpha_vals, enrichment, se)
p1 <- ggplot(plot_df, aes(x=alpha_vals,y = log(enrichment,2))) + geom_point() + ylim(c(-1.5,1.5)) + theme_classic() + geom_hline(yintercept = 0, linetype = 'dotted') + geom_errorbar(aes(ymin = log(enrichment - 1.96*se,2), ymax = log(enrichment + 1.96*se,2)), width = .005) + xlab('Significance level') + ylab('Log ratio of observed vs expected false positive rate')
p1

Plot standard error prediction

errors <- sweep(e_all,3,de_vals,'-')
vars <- s_all^2
errors <- errors[order(vars)]^2
vars <- vars[order(vars)] 
bin_size <- 500
N_bins <- length(vars) / bin_size
pred_list <- numeric(N_bins)
obs_list <- numeric(N_bins)
se_list <- numeric(N_bins)
for(i in 1:N_bins) {
  offset <- bin_size * (i-1)
  pred_list[i] <- mean(vars[(1 + offset):(100+offset)])
  obs_list[i] <- mean(errors[(1 + offset):(100+offset)])
  se_list[i] <- sd(errors[(1 + offset):(100+offset)])/sqrt(bin_size)
}
plot_df <- data.frame(pred_list, obs_list, se_list)
colnames(plot_df) <- c('pred', 'obs', 'se')
plot_df <- plot_df * log(exp(1),2)
p <- ggplot(plot_df) + geom_point(aes(sqrt(pred), sqrt(obs))) + geom_errorbar(aes(x = sqrt(pred), ymin = sqrt(obs - 1.96*se), ymax = sqrt(obs + 1.96*se))) + geom_abline() + theme_classic() + xlim(c(0,0.4)) + ylim(c(0,0.4)) + xlab('CSIDE predicted standard error') + ylab('Average observed standard error')
p

Run power analysis

gene_list <- rownames(myRCTD@spatialRNA@counts)
high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
high_genes <- high_genes[which(cell_type_info[[1]][high_genes, common_cell_types][,1] < 3e-2 & cell_type_info[[1]][high_genes, common_cell_types][,2] < 3e-2)]

REPLICATES <- 100 
DE_CONDITIONS <- 7
de_vals <- (((1:DE_CONDITIONS) - 1)/(DE_CONDITIONS - 1))*2-1
de_ground_truth <- c(1,1)
cur_gene_list <- high_genes
NUM_CELLS <- c(250, 500, 1000)
e_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list), length(NUM_CELLS), DE_CONDITIONS))
s_all <- array(0, dim = c(REPLICATES,2,length(cur_gene_list), length(NUM_CELLS), DE_CONDITIONS))
dimnames(e_all) <- list(NULL,NULL,high_genes,NULL,NULL)
dimnames(s_all) <- list(NULL,NULL,high_genes,NULL,NULL)
for(k in 1:DE_CONDITIONS) {
  print(paste("DE",k))
  for(j in 1:length(NUM_CELLS)) {
    print(paste('Ncells',j))
    for(i in 1:length(cur_gene_list)) {
      print(paste('gene',i))
      de_gene <- cur_gene_list[i]
      de_ground_truth[2] <- exp(de_vals[k])
      de_res <- sim_CSIDE(de_ground_truth, REPLICATES, de_gene, ref, N_samples, nUMI, common_cell_types,
                           UMI1, UMI_tot, sigma_init, puck, myRCTD@cell_type_info, beta, UMI_vect,
                           subset_cells = NUM_CELLS[j], regularize_expr = T)
      e_all[,, i, j , k] <- de_res$e_res
      s_all[,,i, j, k] <- de_res$s_res
    }
  }
}
saveRDS(e_all, file.path(resultsdir, 'power/e_all.rds'))
saveRDS(s_all, file.path(resultsdir, 'power/s_all.rds'))

Plot power analysis

NUM_CELLS <- c(250, 500, 1000)
DE_CONDITIONS <- 7
de_vals <- (((1:DE_CONDITIONS) - 1)/(DE_CONDITIONS - 1))*2-1
gene_list <- rownames(myRCTD@spatialRNA@counts)
high_genes <- gene_list[which(cell_type_info[[1]][gene_list, common_cell_types][,1] > 3e-4 & cell_type_info[[1]][gene_list, common_cell_types][,2] > 3e-4)]
high_genes <- high_genes[which(cell_type_info[[1]][high_genes, common_cell_types][,1] < 3e-2 & cell_type_info[[1]][high_genes, common_cell_types][,2] < 3e-2)]
e_all <- readRDS(file.path(resultsdir, 'power/e_all.rds'))
s_all <- readRDS(file.path(resultsdir, 'power/s_all.rds'))
z_all <- e_all / s_all 
p_all <- 2*(1 - pnorm(abs(z_all)))
alpha <- 0.01
power_estimates <- apply(p_all[,2,,,] < 0.01,c(2,3,4),mean)
purk_expr <- log(cell_type_info[[1]][high_genes,'Purkinje'])
plot_df <- data.frame(purk_expr, rowMeans(power_estimates[,1,]))
colnames(plot_df) <- c('expr','power')
R2 <- cor(plot_df$expr, plot_df$power)^2
p1 <- ggplot(plot_df, aes(x = expr, y = power)) + geom_point() + ggtitle(paste('R2=',R2)) + ylim(c(0,1)) + theme_classic()
my_order <- order(purk_expr)
pre_df <- as.data.frame(t(power_estimates[my_order,1,]))
pre_df$de <- de_vals
pre_df$ncell <- NUM_CELLS[1]
plot_df1 <- reshape2::melt(pre_df, id = c('de', 'ncell'))
pre_df <- as.data.frame(t(power_estimates[my_order,2,]))
pre_df$de <- de_vals
pre_df$ncell <- NUM_CELLS[2]
plot_df2 <- reshape2::melt(pre_df, id = c('de', 'ncell'))
pre_df <- as.data.frame(t(power_estimates[my_order,3,]))
pre_df$de <- de_vals
pre_df$ncell <- NUM_CELLS[3]
plot_df3 <- reshape2::melt(pre_df, id = c('de', 'ncell'))
plot_df <- rbind(plot_df1, plot_df2, plot_df3)
norm_purk_expr <- purk_expr - min(purk_expr)
norm_purk_expr <- norm_purk_expr / max(norm_purk_expr)
plot_df <- rbind(plot_df,
                 data.frame('de' = 0, 'ncell' = "Expression", 'variable' = high_genes,'value' = norm_purk_expr))
plot_df$ncell <- factor(plot_df$ncell, levels = c('Expression','250','500','1000'))
plot_df$label = round(plot_df$value, 2)
plot_df$label[plot_df$ncell == 'Expression'] <- NA
plot_df$de <- plot_df$de*log(exp(1),2)
my_labels <- names(table(plot_df$de))
p2 <- ggplot(plot_df[plot_df$ncell != 'Expression',], aes(y = variable, x = de, fill = value)) + facet_wrap(ncell ~., nrow = 1) + geom_raster() + theme_classic() +
  scale_fill_continuous("Power",high = "#132B43", low = "#56B1F7") + geom_text(aes(label = label), size = 2, color = 'white') + scale_x_continuous(breaks=round(as.numeric(my_labels),2)) + ylab("Gene") + xlab("Cell type A differential expression") + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
p3 <- ggplot(plot_df[plot_df$ncell == 'Expression',], aes(y = variable, x = de, fill = value)) + facet_wrap(ncell ~., nrow = 1) + geom_raster() + theme_classic() + scale_x_continuous(breaks=round(as.numeric(my_labels),2)) +scale_fill_continuous("Log cell type A expression",high = pals::brewer.reds(20)[20], low = pals::brewer.reds(20)[2], breaks = c(0,1), labels =round(c(min(purk_expr), max(purk_expr))*log(exp(1),2),2)) + xlab("") + ylab("Gene")+ theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())
ggarrange(p2,p3,nrow = 1, widths = c(3.3,1))

p_my <- p_all[,2,,,]
ind <- 3 # DE
alpha <- 0.05
ncells <- 3
alpha_vals <- c((1:1000)/1000)
alpha_vals <- c(alpha_vals / 1000^3, alpha_vals / 1000^2, alpha_vals / 1000, alpha_vals)
all_res <- array(dim = c(2,3,4,length(alpha_vals))) # fpr/tpr ncells ind alpha_vals
for(k in 1:2) {
  for(ncells in 1:3) {
    for(ind in 1:4) {
      fpr <- numeric(length(alpha_vals))
      tpr <- numeric(length(alpha_vals))
      for(i in 1:length(alpha_vals)) {
        alpha <- alpha_vals[i]
        all_res[2,ncells,ind,i] <- mean(p_my[,,ncells,c(ind, 8 - ind)] < alpha)
        all_res[1,ncells,ind,i] <- mean(p_my[,,ncells,4] < alpha)
      }
    }
  }
}
plot_df <- melt(all_res[,,1:3,])
plot_df <- cbind(plot_df[(1:(dim(plot_df)[1]/2))*2,], plot_df[(1:(dim(plot_df)[1]/2))*2 - 1,])
plot_df <- plot_df[c(1,2,3,4,5,10)]
colnames(plot_df) <- c('type', 'cells', 'ind', 'aind', 'tpr', 'fpr')
#plot_df$ind <- factor(plot_df$ind)
plot_df$cells <- (2^plot_df$cells*125)
plot_df <- bind_rows(data.frame(type=c(2,2,2),cells=c(250,500,1000),ind=c(3,3,3),aind=c(3,3,3),tpr=c(0,0,0),fpr=c(0,0,0)), plot_df)
plot_df$ind <- factor(plot_df$ind)
plot_df$cells <- factor(plot_df$cells)
ggplot(plot_df, aes(x=fpr, y = tpr, color = ind)) + 
  geom_line() + facet_wrap(cells ~., nrow = 1) + theme_classic() + xlab('False positive rate') + ylab('True positive rate') + 
  scale_color_manual("True DE",values = c('#0072B2','#009E73','#D55E00'), labels = c(1.44,.96,.48))+ coord_fixed() + geom_abline(lty = 'dashed') + xlim(c(0,1)) + ylim(c(0,1))
#plot(fpr, tpr, ylim = c(0,1), main = ind)

dmcable/RCTD documentation built on Feb. 24, 2024, 11:03 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

dmcable/RCTD
SpatialeXpressionR: Cell type identification and cell type-specific differential expression in spatial transcriptomics

In dmcable/RCTD: SpatialeXpressionR: Cell type identification and cell type-specific differential expression in spatial transcriptomics

Benchmarking CSIDE on a simulated spatial transcriptomics dataset

Run RCTD on simulated data (the first step of CSIDE)

Load in RCTD results

Run SPARC on contamination simulation

Plot contamination simulation results

Run SPARK on cell type proportion simulation

Plot cell type proportion simulation results

Run CSIDE on null and non-null DE case

Plot null DE estimation results

Plot p-value enrichment

Plot standard error prediction

Run power analysis

Plot power analysis

R Package Documentation

Browse R Packages

We want your feedback!

dmcable/RCTD SpatialeXpressionR: Cell type identification and cell type-specific differential expression in spatial transcriptomics

In dmcable/RCTD: SpatialeXpressionR: Cell type identification and cell type-specific differential expression in spatial transcriptomics

Benchmarking CSIDE on a simulated spatial transcriptomics dataset

Run RCTD on simulated data (the first step of CSIDE)

Load in RCTD results

Run SPARC on contamination simulation

Plot contamination simulation results

Run SPARK on cell type proportion simulation

Plot cell type proportion simulation results

Run CSIDE on null and non-null DE case

Plot null DE estimation results

Plot p-value enrichment

Plot standard error prediction

Run power analysis

Plot power analysis

R Package Documentation

Browse R Packages

We want your feedback!

dmcable/RCTD
SpatialeXpressionR: Cell type identification and cell type-specific differential expression in spatial transcriptomics