get_n_sig_genes_Z_test <- function(cell_type, cur_cell_types, myRCTDde, X2,puck, fdr = .01) {
final_df <- get_Z_test_res_ct(cell_type, cur_cell_types, myRCTDde, X2,puck, fdr = fdr)
return(dim(final_df)[1])
}
get_Z_test_res_ct <- function(cell_type, cur_cell_types, myRCTDde, X2,puck, fdr = .01, log_fc_thresh = 0.4, p_thresh = 1) {
cell_type_ind <- which(cur_cell_types == cell_type)
n_cell_types <- length(cur_cell_types)
sing_barcodes <- names(which(myRCTDde@internal_vars_de$my_beta[,cell_type_ind] > 0.9))
if(length(sing_barcodes) < 4)
return(0)
region_ids <- unlist(lapply(sing_barcodes,
function(barcode) which(as.logical(X2[barcode,]))))
names(region_ids) <- sing_barcodes
gene_list_type <- get_gene_list_type_wrapper(myRCTDde, cell_type, cell_types_present)
p_val_list <- numeric(length(gene_list_type)); names(p_val_list) <- gene_list_type
log_fc_list <- numeric(length(gene_list_type)); names(log_fc_list) <- gene_list_type
log_fc_12_list <- numeric(length(gene_list_type)); names(log_fc_12_list) <- gene_list_type
n_regions_con <- length(table(region_ids))
if(n_regions_con < 2)
return(0)
region_id_names <- names(table(region_ids))
for(gene in gene_list_type) {
Y <- puck@counts[gene,sing_barcodes] / puck@nUMI[sing_barcodes]
obs_df <- data.frame(unlist(lapply(region_id_names, function(region) mean(Y[sing_barcodes[region_ids == region]]))),
unlist(lapply(region_id_names, function(region) length(Y[sing_barcodes[region_ids == region]]))),
unlist(lapply(region_id_names, function(region) sd(Y[sing_barcodes[region_ids == region]]))))
colnames(obs_df) <- c('mean','n','sd')
obs_df$se <- obs_df$sd / sqrt(obs_df$n)
ovr_best_p_val <- 1
best_log_fc <- 0
best_sd <- 0
for(i1 in 1:(n_regions_con-1))
for(i2 in (i1+1):n_regions_con) {
if(obs_df$mean[i1] > 0 & obs_df$mean[i2] > 0) {
log_fc_lin <- abs(obs_df$mean[i1] - obs_df$mean[i2])
sd_cur <- sqrt(obs_df$se[i1]^2 + obs_df$se[i2]^2)
z_score <- log_fc_lin / sd_cur
log_fc <- abs(log(obs_df$mean[i1]+1e-20) - log(obs_df$mean[i2]+1e-20))
if(i1 == 1 && i2 == 2)
log_fc_12_list[gene] <- log(obs_df$mean[i1]+1e-20) - log(obs_df$mean[i2]+1e-20)
p_val <- 2*(1-pnorm(z_score)) * choose(n_regions_con, 2)
if(p_val < min(ovr_best_p_val, p_thresh)) {
ovr_best_p_val <- p_val
best_log_fc <- log_fc
best_sd <- sd_cur
}
}
}
p_val_list[gene] <- min(1, ovr_best_p_val)
log_fc_list[gene] <- best_log_fc
}
gene_list_sig <- fdr_sig_genes(gene_list_type, p_val_list, fdr)
gene_list_sig <- gene_list_sig[log_fc_list[gene_list_sig] > log_fc_thresh]
final_df <- cbind(p_val_list[gene_list_sig], log_fc_list[gene_list_sig], log_fc_12_list[gene_list_sig])
rownames(final_df) <- gene_list_sig
colnames(final_df) <- c('p', 'log_fc', 'log_fc_12')
return(final_df)
}
cor_ct_patterns <- function(cell_type_1, cell_type_2, myRCTDde, cell_types_present, X2, gene_fits, cur_cell_types) {
n_cell_types <- length(cur_cell_types)
n_regions <- dim(X2)[2]
same_genes <- intersect(get_gene_list_type_wrapper(myRCTDde, cell_type_1, cell_types_present),
get_gene_list_type_wrapper(myRCTDde, cell_type_2, cell_types_present))
cell_ind_1 <- which(cur_cell_types == cell_type_1); cell_ind_2 <- which(cur_cell_types == cell_type_2)
I_mat_ind_1 <- (1:n_regions) + (n_regions*(cell_ind_1 - 1))
I_mat_ind_2 <- (1:n_regions) + (n_regions*(cell_ind_2 - 1))
res_corr <- numeric(length(same_genes))
names(res_corr) <- same_genes
res_n <- numeric(length(same_genes))
names(res_n) <- same_genes
p_val_thresh <- 0.001
for(gene in same_genes) {
con_regions <- get_con_regions(gene_fits, gene, n_regions, cell_ind_1, n_cell_types) &
get_con_regions(gene_fits, gene, n_regions, cell_ind_2, n_cell_types)
n_regions_con <- sum(con_regions)
x1 <- gene_fits$all_vals[gene, con_regions,cell_ind_1]
x2 <- gene_fits$all_vals[gene, con_regions,cell_ind_2]
I_mat_ind_cur_1 <- I_mat_ind_1[con_regions]; I_mat_ind_cur_2 <- I_mat_ind_2[con_regions]
var_vals_1 <- (gene_fits$I_mat[gene, I_mat_ind_cur_1])^2
var_vals_2 <- (gene_fits$I_mat[gene, I_mat_ind_cur_2])^2
ovr_best_p_val <- 1
best_log_fc <- 0
best_sd <- 0
trials <- 0;
success <- 0;
for(i1 in 1:(n_regions_con-1))
for(i2 in (i1+1):n_regions_con) {
log_fc_1 <- x1[i1] - x1[i2]
sd_cur_1 <- sqrt(var_vals_1[i1] + var_vals_1[i2])
z_score_1 <- abs(log_fc_1) / sd_cur_1
p_val_1 <- 2*(1-pnorm(z_score_1))
log_fc_2 <- x2[i1] - x2[i2]
sd_cur_2 <- sqrt(var_vals_2[i1] + var_vals_2[i2])
z_score_2 <- abs(log_fc_2) / sd_cur_2
p_val_2 <- 2*(1-pnorm(z_score_2))
if(max(p_val_1, p_val_2) < p_val_thresh) {
trials <- trials + 1
if(sign(log_fc_1) == sign(log_fc_2))
success <- success + 1
}
}
if(trials > 0) {
res_corr[gene] <- success / trials
} else {
res_corr[gene] <- -1
}
res_n[gene] <- trials
}
return(list(res_n = res_n, res_corr = res_corr))
}
cor_ct_patterns_quant <- function(cell_type_1, cell_type_2, myRCTDde, cell_types_present, X2, gene_fits, cur_cell_types) {
n_regions <- dim(X2)[2]
same_genes <- intersect(get_gene_list_type_wrapper(myRCTDde, cell_type_1, cell_types_present),
get_gene_list_type_wrapper(myRCTDde, cell_type_2, cell_types_present))
cell_ind_1 <- which(cur_cell_types == cell_type_1); cell_ind_2 <- which(cur_cell_types == cell_type_2)
I_mat_ind_1 <- (1:n_regions) + (n_regions*(cell_ind_1 - 1))
I_mat_ind_2 <- (1:n_regions) + (n_regions*(cell_ind_2 - 1))
cor_results <- matrix(0,nrow = n_regions, ncol = n_regions)
for(region_1 in 1:(n_regions-1))
for(region_2 in (region_1+1):n_regions) {
con_genes <- names(which(
gene_fits$con_all[same_genes,I_mat_ind_1[region_1]] & gene_fits$con_all[same_genes,I_mat_ind_2[region_1]]
& gene_fits$con_all[same_genes,I_mat_ind_1[region_2]] & gene_fits$con_all[same_genes,I_mat_ind_2[region_2]]))
diff_1 <- gene_fits$all_vals[same_genes,region_1,cell_ind_1] - gene_fits$all_vals[same_genes,region_2,cell_ind_1]
diff_2 <- gene_fits$all_vals[same_genes,region_1,cell_ind_2] - gene_fits$all_vals[same_genes,region_2,cell_ind_2]
cor_results[region_1,region_2] <- cor(diff_1,diff_2)
}
return(cor_results)
}
get_marker_by_region <- function(all_ct, cell_type, n_regions, myRCTDde, cell_types_present, gene_fits, p_val_thresh = .001, trials = 5000, log_fc_thresh = 0.4) {
other_cell_types <- all_ct[all_ct != cell_type]
cell_ind <- which(cell_type == cur_cell_types)
I_mat_ind <- (1:n_regions) + (n_regions*(cell_ind - 1))
# for cell type 1, log_fc_thresh = 0.4 -> marker genes for all regions. For ct 2, marker genes only for region 4
# for cell type 2, log_fc_thresh = 0.4 -> marker genes for all regions
# for cell type 4, log_fc_thresh = 0.4 -> marker genes for regions 1, 3, and 4. If lower p val thresh to 0.01, then we we get all regions
# if we use our alternative monte carlo p value test, we get significant for all cases
# for cell type 4, best genes are (Aurka, Syt11, Piwil1, Smg9)
ct_genes <- get_gene_list_type_wrapper(myRCTDde, cell_type, cell_types_present)
marker_ratio <- log(myRCTDde@cell_type_info$info[[1]][ct_genes,cell_type] /
apply(myRCTDde@cell_type_info$info[[1]][ct_genes,other_cell_types] ,1,max))
marker_genes <- names(which(marker_ratio > log_fc_thresh))
Z_thresh_vals <- unlist(lapply(marker_genes, function(gene) {
print(gene)
z <- matrix(0, nrow = trials, ncol = n_regions)
z[] <- rnorm(n_regions*trials)
sd_vals <- diag(gene_fits$I_val[[which(rownames(gene_fits$I_mat) == gene)]][I_mat_ind,I_mat_ind])
z <- sweep(z, 2, sd_vals, '*')
diff_sim <- apply(z,1,max) - apply(z,1,function(x) x[order(-x)][2])
first_region_sim <- apply(z,1,which.max)
second_region_sim <- apply(z,1,function(x) order(-x)[2])
Z_sim <- diff_sim / sqrt(sd_vals[first_region_sim]^2 + sd_vals[second_region_sim]^2)
Z_thresh <- Z_sim[order(-Z_sim)[round(trials * p_val_thresh)]]
return(Z_thresh)
}))
names(Z_thresh_vals) <- marker_genes
#table(gene_fits$con_all[marker_genes,I_mat_ind]) #all converged for all cell types wooo!!!
diff <- apply(gene_fits$all_vals[marker_genes, , cell_ind],1,max) -
apply(gene_fits$all_vals[marker_genes, , cell_ind],1,function(x) x[order(-x)][2])
first_region <- apply(gene_fits$all_vals[marker_genes, , cell_ind],1,which.max)
second_region <- apply(gene_fits$all_vals[marker_genes, , cell_ind],1,function(x) order(-x)[2])
Z_scores <- unlist(lapply(marker_genes, function(gene) diff[gene] / sqrt(sum(gene_fits$I_mat[gene,I_mat_ind[c(first_region[gene],second_region[gene])]]^2))))
p_vals <- 2*(1-pnorm(Z_scores))
#sig_genes <- names(which(p_vals < p_thresh & diff > log_fc_thresh))
sig_genes <- names(which(Z_scores > Z_thresh_vals & diff > log_fc_thresh))
table(first_region[sig_genes])
hist(p_vals)
info_df <- data.frame(sig_genes, first_region[sig_genes], Z_scores[sig_genes], Z_thresh_vals[sig_genes],
p_vals[sig_genes], diff[sig_genes], second_region[sig_genes], marker_ratio[sig_genes],
log(myRCTDde@cell_type_info$info[[1]][sig_genes,cell_type]))
colnames(info_df) <- c('gene','first_region','Z', 'Z_thresh','p','diff','second_region','marker_ratio', 'expr')
return(info_df)
}
make_ct_region_plot <- function(cur_cell_types, cell_type, region, my_beta, X2, gene_thresh, puck, gene, sing_thresh = 0.8) {
ct_ind <- which(cur_cell_types == cell_type)
prop_list <- my_beta[intersect(rownames(my_beta),names(which(X2[,region] == 1))),ct_ind]
ct_reg_barc <- names(which(prop_list > sing_thresh))
MULT <- 500
if(length(gene) > 1)
Y_plot <- MULT*colSums(puck@counts[gene,])/puck@nUMI
else
Y_plot <- MULT*puck@counts[gene,]/puck@nUMI
my_title = paste(gene, gene_thresh)
barc_plot <- c(names(which(prop_list > sing_thresh | prop_list < 0.2)),
rownames(my_beta[intersect(rownames(my_beta),names(which(X2[,region] == 0))),]))# exclude the middle ones
my_class <- rep(0,length(barc_plot)); names(my_class) <- barc_plot
my_class[!(barc_plot %in% ct_reg_barc) & (Y_plot[barc_plot] <= gene_thresh)] <- 1
my_class[!(barc_plot %in% ct_reg_barc) & (Y_plot[barc_plot] > gene_thresh)] <- 3
my_class[(barc_plot %in% ct_reg_barc) & (Y_plot[barc_plot] <= gene_thresh)] <- 2
my_class[(barc_plot %in% ct_reg_barc) & (Y_plot[barc_plot] > gene_thresh)] <- 4
p3 <- plot_class(puck, barc_plot[order(my_class[barc_plot])], factor(my_class), title = my_title)
suppressMessages(p3 <- p3 + scale_color_manual(values=c("#CCE2EF","#F6DECC","#0072B2","#D55E00")))
return(p3)
}
get_Z_test_res <- function(cur_cell_types, myRCTDde, X2, puck) {
Z_test_results <- unlist(lapply(cur_cell_types,
function(cell_type) get_n_sig_genes_Z_test(cell_type, cur_cell_types, myRCTDde, X2,puck, fdr = .01)))
names(Z_test_results) <- cur_cell_types
#saveRDS(Z_test_results, file = file.path(datadir,'z_test.rds'))
RCTDde_test <- unlist(lapply(cur_cell_types,
function(cell_type) length(rownames(myRCTDde@de_results$res_gene_list[[cell_type]]))))
names(RCTDde_test) <- cur_cell_types
n_singlets <- unlist(lapply(1:length(cur_cell_types),
function(cell_type_ind) sum(myRCTDde@internal_vars_de$my_beta[,cell_type_ind] > 0.9)))
names(n_singlets) <- cur_cell_types
n_total <- colSums(myRCTDde@internal_vars_de$my_beta[,cur_cell_types])
df1 <- data.frame(cur_cell_types, n_singlets, n_total, Z_test_results,'Z')
df2 <- data.frame(cur_cell_types, n_singlets, n_total, RCTDde_test, 'RCTD')
colnames(df1)[4:5] <- c('count','method'); colnames(df2)[4:5] <- c('count','method')
plot_df <- rbind(df1,df2)
plot_df[plot_df$method == 'Z','n_total'] <- plot_df[plot_df$method == 'Z','n_singlets']
return(plot_df)
}
is.maxcyclic.thresh.large <- function(x, D_thresh = -0.25) {
my_ind <- (which.max(x) + 1) %% 4 + 1
return(x[my_ind] - x[order(x)][2] <= D_thresh)
}
breathing.room <- function(x) {
x <- x[order(x)]
return(min(x[2] - x[1], x[4] - x[3]))
}
shifter <- function(x, n = 1) {
if (n == 0) x else c(tail(x, -n), head(x, n))
}
norm_vec <- function(x) {
x <- x - min(x)
x <- x / max(x)
max_ind <- which.max(x)
return(shifter(x, max_ind - 2))
}
is.cyclic <- function(x) {
if(x[2] < x[1]) {
return(!(x[3] > x[2] & x[4] < x[3]))
} else {
return(!(x[3] < x[2] & x[4] > x[3]))
}
}
is.maxcyclic <- function(x) {
return(abs(which.min(x) - which.max(x)) == 2)
}
D_thresh <- 0.25
is.maxcyclic.thresh <- function(x) {
my_ind <- (which.max(x) + 1) %% 4 + 1
return(x[my_ind] - min(x) <= D_thresh)
}
maxcyclic.thresh.prob <- function(x) {
return(min(sum(x - min(x) < D_thresh),3)/3)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.