knitr::opts_chunk$set(echo = T, warning = F, message = F, cache = T, cache.lazy = FALSE, results = 'hide')

CSIDE on the Slide-seq testes data

Pre-process testes data

library(spacexr)
library(Matrix)
library(devtools)
library(ggplot2)
library(ggpubr)
library(reshape2)
library(dplyr)
library(ggrepel)
source('~/Documents/MIT/Research/Rafalab/Projects/spacexr/AnalysisCSIDE/helper_functions/testes_helper.R')
source('~/Documents/MIT/Research/Rafalab/Projects/spacexr/AnalysisCSIDE/helper_functions/merge_de_helper.R')
devtools::load_all()
datadir <- '../../data/SpatialRNA/Testes'
puck <- readRDS(file.path(datadir,'puck.rds'))
### END PRELUDE
myRCTD <- readRDS(file.path(datadir,'myRCTD_testes.rds'))
myRCTD@originalSpatialRNA <- puck
codes_1 <- c()
stadir <- file.path(datadir,'Stage/I-III')
id <- 0
cvx_df <- data.frame(x=numeric(),y=numeric(),stage=numeric(),id = numeric())
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_1 <- c(codes_1,tubule_codes)
  print(length(codes_1))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 1,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

codes_2 <- c()
stadir <- file.path(datadir,'Stage/IV-VI')
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_2 <- c(codes_2,tubule_codes)
  print(length(codes_2))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 2,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

codes_3 <- c()
stadir <- file.path(datadir,'Stage/VII-VIII')
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_3 <- c(codes_3,tubule_codes)
  print(length(codes_3))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 3,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

codes_4 <- c()
stadir <- file.path(datadir,'Stage/IX-XII')
for(file in list.files(stadir)) {
  labels <- read.csv(file.path(stadir,file))
  tubule_codes <- as.character(labels$barcode)
  codes_4 <- c(codes_4,tubule_codes)
  print(length(codes_4))
  my_coords <- puck@coords[tubule_codes,1:2]
  my_ind <- chull(puck@coords[tubule_codes,])
  my_coords <- my_coords[my_ind,]
  temp_df <- cbind(my_coords, 4,id)
  colnames(temp_df)[3] <- 'stage'
  cvx_df <- bind_rows(cvx_df, temp_df)
  id <- id + 1
}

barcodes_1 <- intersect(codes_1,names(myRCTD@spatialRNA@nUMI))
barcodes_2 <- intersect(codes_2,names(myRCTD@spatialRNA@nUMI))
barcodes_3 <- intersect(codes_3,names(myRCTD@spatialRNA@nUMI))
barcodes_4 <- intersect(codes_4,names(myRCTD@spatialRNA@nUMI))
region_list <- list(barcodes_1, barcodes_2, barcodes_3, barcodes_4)
saveRDS(cvx_df, file.path(datadir, 'cvx_df.rds'))
saveRDS(region_list, file.path(datadir, 'region_list.rds'))
region_list <- readRDS(file.path(datadir, 'region_list.rds'))
cvx_df <- readRDS(file.path(datadir, 'cvx_df.rds'))
cvx_df$stage <- factor(cvx_df$stage)
cvx_df <- bind_rows(cvx_df, cvx_df[which(!duplicated(cvx_df$id)),])
all_barc <- intersect(Reduce(union,region_list), colnames(myRCTD@spatialRNA@counts))
aggregate_cell_types(myRCTD,all_barc,doublet_mode = F)
myRCTD@config$max_cores <- 4
cur_cell_types = c('1','2','4','5','6')
cell_types <- cur_cell_types

Run CSIDE

myRCTDde <- run.de.regions(myRCTD, region_list, datadir = datadir, cell_types = cur_cell_types,
                           doublet_mode = F)
myRCTDde@internal_vars_de$delta <- 0; myRCTDde@internal_vars_de$test_mode <- 'multi'
myRCTDde@reference <- gen_small_reference()
myRCTDde <- add_res_genes(myRCTDde, datadir = datadir, plot_genes = F)
saveRDS(myRCTDde, file.path(datadir,'myRCTDde_updated2.rds'))
make_all_de_plots(myRCTDde, datadir)
myRCTDde <- readRDS(file.path(datadir,'myRCTDde_updated2.rds'))
cell_type_mapping <- c('1'="ES", '2'="RS", '3'='Myoid', '4'="SPC", '5'="SPG",
  '6'="Sertoli", '7'="Leydig",'8'="Endothelial",'9'="Macrophage")
#extract relevant variables
X2 <- myRCTDde@internal_vars_de$X2
my_beta <- myRCTDde@internal_vars_de$my_beta
cell_types_present <- myRCTDde@internal_vars_de$cell_types_present
cur_cell_types <- myRCTDde@internal_vars_de$cell_types
all_barc <- myRCTDde@internal_vars_de$all_barc
n_regions <- dim(X2)[2]
gene_fits <- myRCTDde@de_results$gene_fits

Compare to Z-test

plot_df <- get_Z_test_res(c('1','2','4'), myRCTDde, X2, puck) 
saveRDS(plot_df, file.path(datadir, 'z_test_plot_df.rds'))
ct_df <- get_Z_test_res_ct('1', cur_cell_types, myRCTDde, X2,puck, fdr = .01) 
saveRDS(ct_df, file.path(datadir, 'ct_df.rds'))

Plot Z-test results

n_levels = myRCTD@cell_type_info$info[[3]]
my_pal = pals::kelly(n_levels+1)[2:(n_levels+1)]
names(my_pal) = myRCTD@cell_type_info$info[[2]]
my_pal_curr <- my_pal
my_pal_curr[cell_types[4]] <- "#CC79A7"
my_pal_curr[cell_types[5]] <- "#E69F00"
my_pal_curr[cell_types[3]] <- "#D55E00"
my_pal_curr[cell_types[1]] <- "#009E73"
my_pal_curr[cell_types[2]] <- "#0072B2"
plot_df <- readRDS(file.path(datadir, 'z_test_plot_df.rds'))
p1 <- ggplot(plot_df, aes(fill = method, x = cur_cell_types, y = count)) +
  geom_bar(position="dodge", stat="identity") + theme_classic() + ylab('Number of significant genes detected') + scale_fill_manual("Method", breaks = c('RCTD','Z'), labels = c('CSIDE','Z-test'), values = unname(my_pal_curr[c(4,2)])) + scale_x_discrete('Cell Type', breaks = c(1,2,4), labels = c('ES','RS','SPC'))
p2 <- ggplot(plot_df, aes(fill = method, x = cur_cell_types, y = n_total)) +
  geom_bar(position="dodge", stat="identity") + theme_classic()  + ylab('Number of pixels used for each cell type') + scale_fill_manual("Method", breaks = c('RCTD','Z'), labels = c('CSIDE','Z-test'), values = unname(my_pal_curr[c(4,2)])) + scale_x_discrete('Cell Type', breaks = c(1,2,4), labels = c('ES','RS','SPC'))
ggarrange(p1,p2, nrow = 2)

Plot correlation of overlap

ct_df <- readRDS(file.path(datadir, 'ct_df.rds'))
both_genes <- intersect(rownames(myRCTDde@de_results$res_gene_list[['1']]), rownames(ct_df))
print(cor(gene_fits$all_vals[both_genes,1,1] - gene_fits$all_vals[both_genes,2,1], 
     ct_df[both_genes, 'log_fc_12'])^2)
plot(gene_fits$all_vals[both_genes,1,1] - gene_fits$all_vals[both_genes,2,1], 
     ct_df[both_genes, 'log_fc_12'])

Plot correlation across cell types

cell_type_1 <- '2'; cell_type_2 <- '4'
cell_type_1 <- '1'; cell_type_2 <- '4'
cell_type_1 <- '1'; cell_type_2 <- '2'
cor_res <- cor_ct_patterns(cell_type_1, cell_type_2, myRCTDde, cell_types_present, X2, gene_fits, cur_cell_types)
hist(cor_res$res_n)
hist(cor_res$res_corr[cor_res$res_corr > -1])
cor_mat <- cor_ct_patterns_quant(cell_type_1, cell_type_2, myRCTDde, cell_types_present, X2, gene_fits, cur_cell_types)
corrplot::corrplot(cor_mat)

Detect cell type and stage specific marker genes

all_ct <- c('1','2','4')
info_1 <- get_marker_by_region(all_ct, '1', n_regions, myRCTDde, cell_types_present, gene_fits, trials = 50000, p_val_thresh = .001)
saveRDS(info_1, file.path(datadir, 'info_1.rds'))
info_1 <- readRDS(file.path(datadir, 'info_1.rds'))
info_2 <- get_marker_by_region(all_ct, '2', n_regions, myRCTDde, cell_types_present, gene_fits, trials = 50000, p_val_thresh = .001)
saveRDS(info_2, file.path(datadir, 'info_2.rds'))
info_2 <- readRDS(file.path(datadir, 'info_2.rds'))
info_4 <- get_marker_by_region(all_ct, '4', n_regions, myRCTDde, cell_types_present, gene_fits, trials = 50000, p_val_thresh = .001)
saveRDS(info_4, file.path(datadir, 'info_4.rds'))
info_4 <- readRDS(file.path(datadir, 'info_4.rds'))
table(info_1$first_region)
table(info_2$first_region)
table(info_4$first_region)
info_1[order(info_1$first_region),]
infodf <- info_1[info_1$first_region == 4,]
infodf[order(-infodf$diff),]

Plot marker genes

region <- 4
info_1 <- readRDS(file.path(datadir, 'info_1.rds'))
gene_list <- rownames(info_1)[info_1$first_region == region]
final_gene_list <- gene_list
fit_df <- cbind(gene_fits$all_vals[final_gene_list,,1], gene_fits$all_vals[final_gene_list,,2], gene_fits$all_vals[final_gene_list,,3])
fit_df <- exp(fit_df)
norm_df <- sweep(fit_df, 1, apply(fit_df,1,max),'/')
plot_df <- reshape2::melt(norm_df)
colnames(plot_df) <- c('gene', 'region_id', 'expr')
plot_df$region <- ((plot_df$region_id - 1) %% n_regions)+ 1
plot_df$cell_type <- cur_cell_types[floor((plot_df$region_id - 1) / n_regions) + 1]
cur_range = c(0,1)
p <- ggplot(plot_df, aes(region_id, gene, fill = expr)) + geom_tile() +
  scale_fill_gradientn(colors = pals::brewer.blues(20)[2:20],name = "Normalized Expression", labels = c(0,1),breaks = c(0,1))
p

Plot Tnp1

library(ggnewscale)
cell_type <- '1'; region <- 4
gene_thresh <- 7.5; gene <- 'Tnp1'
make_ct_region_plot(cur_cell_types, cell_type, region, my_beta, X2, gene_thresh, puck, gene, sing_thresh = 0.8) + new_scale_color()+
  geom_path(data = cvx_df,aes(x=x,y=y, group = id, color = stage), linetype = 'dashed') + scale_color_manual('Stage', values = c('#E69F00','#CC79A7', '#EFCB00',"#000000"), labels = c('I-III','IV-VI', 'VII-VIII', 'IX-XII'))+ geom_segment(aes(x = 1300, y = 1000, xend = 1684.6, yend = 1000), color = "black")+labs(color='') +  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank()) + coord_fixed()

Plot metagene signature

region <- 4
gene_list <- rownames(info_1)[info_1$first_region == region]
cell_type <- '1'; region <- 4
gene_thresh <- 45; gene <- 'Tnp1'
make_ct_region_plot(cur_cell_types, cell_type, region, my_beta, X2, gene_thresh, puck, gene_list, sing_thresh = 0.8) + new_scale_color()+
  geom_path(data = cvx_df,aes(x=x,y=y, group = id, color = stage), linetype = 'dashed') + scale_color_manual('Stage', values = c('#E69F00','#CC79A7', '#EFCB00',"#000000"), labels = c('I-III','IV-VI', 'VII-VIII', 'IX-XII'))+ geom_segment(aes(x = 1300, y = 1000, xend = 1684.6, yend = 1000), color = "black")+labs(color='') +  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(), axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank()) + coord_fixed()

Find and plot cyclic genes

gene_list_final <- c()
region_list <- c()
ct_list <- c()
mean_list <- matrix(0, 12,4)
sd_list <- matrix(0,12,4)
cur_ind <- 1
for(ct_ind in 1:3) {
  Imat_ind <- (ct_ind-1)*4 + (1:4)
  sig_gene_list <- rownames(myRCTDde@de_results$res_gene_list[[ct_ind]])
  mean_mat <- gene_fits$all_vals[sig_gene_list,,ct_ind]
  sd_mat <- gene_fits$I_mat[sig_gene_list, Imat_ind]
  cur_gene_list <- names(which(rowSums(sd_mat < 0.2) == 4))
  cur_gene_list <- names(which(apply(mean_mat[cur_gene_list,],1, is.maxcyclic)))
  maxcyc_df <- cbind(apply(mean_mat[cur_gene_list,],1,which.max), 
        apply(mean_mat[cur_gene_list,],1,breathing.room))
  for(region in unique(maxcyc_df[,1])) {
    match_list <- which(maxcyc_df[,1] == region)
    reg_df <- maxcyc_df[match_list,]
    if(length(match_list) > 1)
      cur_gene <- names(which.max(reg_df[,2]))
    else
      cur_gene <- rownames(maxcyc_df)[which(maxcyc_df[,1] == region)]
    gene_list_final <- c(gene_list_final,cur_gene)
    region_list <- c(region_list,region)
    ct_list <- c(ct_list, ct_ind)
    mean_list[cur_ind,] <- mean_mat[cur_gene,]
    sd_list[cur_ind,] <- sd_mat[cur_gene,]
    cur_ind <- cur_ind + 1
  }
}
plot_df <- data.frame(gene_list_final,region_list,ct_list)
mean_list <- mean_list[1:(cur_ind-1),]
sd_list <- sd_list[1:(cur_ind-1),]
df1 <- cbind(plot_df, 1, mean_list[,1], sd_list[,1])
plot_df <- rbind(setNames(df1,colnames(df1)),
                 setNames(cbind(plot_df, 2, mean_list[,2], sd_list[,2]),colnames(df1)),
                 setNames(cbind(plot_df, 3, mean_list[,3], sd_list[,3]),colnames(df1)),
                 setNames(cbind(plot_df, 4, mean_list[,4], sd_list[,4]),colnames(df1)))
colnames(plot_df) <- c('gene','regionmax','cell_type','region','mean','sd')
plot_df$regionmax <- factor(plot_df$regionmax)
plot_df$cell_type <- factor(plot_df$cell_type)
library(ggrepel)
plot_df$label <- plot_df$gene
plot_df$label[plot_df$region > 1] <- NA
plot_df$region <- factor(plyr::mapvalues(plot_df$region, from = c(1:4), to= c('I-III','IV-VI','VII-VIII','IX-XII')), levels = c('I-III','IV-VI','VII-VIII','IX-XII'))
plot_df$regionmax <- factor(plyr::mapvalues(plot_df$regionmax, from = c(1:4), to= c('I-III','IV-VI','VII-VIII','IX-XII')), levels = c('I-III','IV-VI','VII-VIII','IX-XII'))
plot_df[,c('mean','sd')] <- plot_df[,c('mean','sd')]*log(exp(1),2) # scale to log 2
plot_df$mean <- plot_df$mean + log(500,2)#  counts per 500
ggplot(plot_df, aes(x = region, y = mean, group = gene, color = cell_type)) + geom_line() + geom_point(size = 3) + theme_classic()  +
  geom_label_repel(aes(label = label),nudge_x = -0.05,na.rm = TRUE, show.legend = F) + facet_wrap(regionmax ~.) +
  geom_errorbar(aes(ymin = mean - 1.96*sd, ymax = mean+1.96*sd, width = 0.1)) + scale_color_manual('Cell type', breaks = c(1,2,3),values = unname(my_pal_curr[c(1,2,4)]), labels = c('ES','RS','SPC')) + xlab('Stage') + ylab('Log estimated expression')

Aggregate max-cyclic genes

results_mat <- matrix(0,3,9)

for(ct_ind in 1:3) {
  Imat_ind <- (ct_ind-1)*4 + (1:4)
  sig_gene_list <- rownames(myRCTDde@de_results$res_gene_list[[ct_ind]])
  mean_mat <- gene_fits$all_vals[sig_gene_list,,ct_ind]
  cyclic <- table(apply(mean_mat, 1, is.cyclic))
  percent_cyclic <- cyclic[2] / sum(cyclic)
  maxcyclic <- table(apply(mean_mat, 1, is.maxcyclic.thresh)) 
  percent_maxcyclic <- maxcyclic[2] / sum(maxcyclic)
  random_chance<-  mean(apply(mean_mat, 1, maxcyclic.thresh.prob))
  norm_mean_mat <- t(apply(mean_mat, 1, norm_vec))
  results_vec <- c(D_thresh, percent_cyclic, 3/4, percent_maxcyclic, random_chance, colMeans(norm_mean_mat))
  results_mat[ct_ind, ] <- results_vec
}
colnames(results_mat) <- c('D_thresh', 'cyclic', 'randomc', 'cyclicmax', 'randomm', 'pos1','pos2','pos3','pos4')
results_mat <- as.data.frame(results_mat)
results_mat$ct_ind <- 1:3
plot_df <- melt(results_mat[,c(2:5,10)], id = 'ct_ind')
ggplot(plot_df[plot_df$variable %in% c('cyclicmax','randomm'),],aes(x=factor(ct_ind), y =value, fill = factor(variable))) +
  geom_bar(stat = 'identity', position = 'dodge') + ylim(c(0,1)) + theme_classic()+ ylab('Cyclic genes as a proportion of significant genes') + scale_fill_manual("", breaks = c('cyclicmax','randomm'), labels = c('CSIDE Estimates','Shuffled'), values = unname(my_pal_curr[c(4,2)])) + scale_x_discrete('Cell Type', breaks = c(1,2,3), labels = c('ES','RS','SPC'))
barcodes <- myRCTDde@internal_vars_de$all_barc
O <- myRCTDde@internal_vars_de$my_beta > 0.25
S <- myRCTDde@internal_vars_de$my_beta[,1] + myRCTDde@internal_vars_de$my_beta[,2] > 0.25
vec <- c(sum(S & O[,3] & myRCTDde@internal_vars_de$X2[all_barc,1]),
  sum(!S & O[,3]& myRCTDde@internal_vars_de$X2[all_barc,1]),
  sum(S & O[,3] & myRCTDde@internal_vars_de$X2[all_barc,2]),
  sum(!S & O[,3]& myRCTDde@internal_vars_de$X2[all_barc,2]),
  sum(S & O[,3] & myRCTDde@internal_vars_de$X2[all_barc,3]),
  sum(!S & O[,3]& myRCTDde@internal_vars_de$X2[all_barc,3]),
  sum(S & O[,3] & myRCTDde@internal_vars_de$X2[all_barc,4]),
  sum(!S & O[,3]& myRCTDde@internal_vars_de$X2[all_barc,4]))
plot_df <- data.frame(cbind(vec, c(T,F,T,F,T,F,T,F), c(1,1,2,2,3,3,4,4)))
colnames(plot_df) <- c('n','S','region')
ggplot(plot_df,aes(x=factor(region), y =n, fill = factor(S))) +
  geom_bar(stat = 'identity', position = 'dodge') + theme_classic() + ylab('Number of pixels') + xlab('Region') + scale_fill_manual('', breaks=c(0,1), labels=c('ES/RS Absent','ES/RS Present'), values = unname(my_pal_curr[c(4,2)]))


dmcable/RCTD documentation built on Feb. 24, 2024, 11:03 p.m.