library(conos) library(pheatmap) #library(fuck) devtools::load_all('/home/larsc/SecretUtils') require(pagoda2) library(dplyr) library(stringr) library(tidyr) library(ggplot2) library(cowplot) con_object <- readRDS(file.path('/home/larsc/data/10x_preproced_graphed.rds')) annot <- readRDS(file.path('/home/demharters/R/projects/UPF9_14_17_19_22_23_24_32_33/metadata_10x_final.rds'))
rowbind the adjusted expression values
if (is.null(annot$cellid)) { annot$cellid <- annot %>% rownames } annot <- annot %>% mutate(subtype_condition = paste(annot$subtype, annot$condition, sep='_')) rbound_panel <- RbindPanel(con_object)
Are the cell names in the same order?
identical(rownames(rbound_panel), rownames(annot)) # thank god, maybe add a sort for the future in Panelize
Inter sample distance, aggregated
od_genes = conos:::getOdGenesUniformly(con_object$samples, 3000) # split annot in categories (disease, healthy) state_split <- split(annot, annot$condition, drop=TRUE) # get probability distributions for subtype/sample categories all_probs <- ObtainProbabilities(state_split, rbound_panel, 1, 2, 4, od_genes, pseudo.count=10^(-6)) # now get JS distances all_distances <- Map(CalculateAllJSD, all_probs$healthy, all_probs$epilepsy) #all_dists_dfd <- all_distances %>% lapply(function(x){data.frame(comparison=names(x), value=x, row.names=NULL)}) # use stackexchange magic to pad NAs to shorter distance vectors, and also tibbling all_dists <- lapply(all_distances, `length<-`, max(lengths(all_distances))) %>% as_tibble all_dists <- all_dists %>% mutate(comparison=names(all_distances$Id2_Reln)) # gather into format for easy ggplot all_dists_gathered <- gather(all_dists, key=subtype, value=js_distance, -comparison) # plot ggplot(all_dists_gathered, aes(y=js_distance, x=subtype)) + geom_violin()+geom_point(aes(col=comparison), alpha = 0.6, size=1.8) + theme(axis.text.x = element_text(angle = -90, hjust = 1))
What about within condition distances (jensen shannon)
getUpperTri <- function(dist.numeric){ dist.vec <- as.vector(dist.numeric) dist.mat <- dist.vec %>% matrix(nrow=sqrt(dist.vec %>% length)) name.mat <- names(dist.numeric) %>% matrix(nrow=sqrt(dist.vec %>% length)) comp.names <- name.mat[upper.tri(name.mat)] vals <- dist.mat[upper.tri(dist.mat)] names(vals) <- comp.names return(vals) } Dataframize <- function(vector.name, a.list){ named.vector <- a.list[[vector.name]] init.df <- bind_cols(value=named.vector, comparison=names(named.vector)) full.df <- init.df %>% mutate(subtype=vector.name) return(full.df) } InterSampleDists <- function(cond.sub.samples){ inter.dists <- Map(CalculateAllJSD, cond.sub.samples, cond.sub.samples) inter.dists <- inter.dists %>% lapply(getUpperTri) inter.dists <- do.call(rbind, inter.dists %>% names %>% lapply(Dataframize, inter.dists)) return(inter.dists) } Dataframize2 <- function(between.vector.name, list.of.vectors){ between.vector <- list.of.vectors[[between.vector.name]] comparison <- between.vector %>% names df <- (bind_cols(value=between.vector, comparison=comparison)) df$subtype <- between.vector.name return(df) } all_probs <- ObtainProbabilities(state_split, rbound_panel, 1, 2, 4, od_genes, pseudo.count=10^(-6)) eps_distances <- Map(CalculateAllJSD, all_probs$epilepsy, all_probs$epilepsy) health_distances <- Map(CalculateAllJSD, all_probs$healthy, all_probs$healthy) between_distances <- Map(CalculateAllJSD, all_probs$healthy, all_probs$epilepsy) # if we want to try viktors JS plot we replace the rbound panel with neighbour smoothed # and we replace sample probabilities with individual cell probabilities eps_dist2 <- eps_distances %>% lapply(getUpperTri) health_dist2 <- health_distances %>% lapply(getUpperTri) between_dist2 <- names(between_distances) %>% lapply(Dataframize2, between_distances) %>% bind_rows eps_dist3 <- do.call(rbind, eps_dist2 %>% names %>% lapply(Dataframize, eps_dist2)) health_dist3 <- do.call(rbind, health_dist2 %>% names %>% lapply(Dataframize, health_dist2)) combined_dists <- rbind(eps_dist3, health_dist3) %>% mutate(condition = c(rep('epilepsy', 60), rep('healthy', 114))) combined_dists2 <- rbind(combined_dists, mutate(between_dist2, condition='between')) combined_dists2 %>% ggplot(aes(x=subtype, y=value, col=condition))+geom_boxplot()+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
Overall JS, all sample-conds aggregated let's see overall JS dist.
od_genes = conos:::getOdGenesUniformly(con_object$samples, 1000) state_split <- split(annot, annot$condition, drop=TRUE) subtype_split <- state_split %>% lapply(function(x){split(x, x$subtype, drop=TRUE)}) healthy_probs <- subtype_split$healthy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10)) epilepsy_probs <- subtype_split$epilepsy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10)) all_dists <- Map(JensenShannon, healthy_probs, epilepsy_probs) %>% as_tibble all_dists_gathered <- gather(all_dists, key=subtype, value=js_distance) ggplot(all_dists_gathered, aes(y=js_distance, x=subtype)) +geom_bar(stat='identity', col='hotpink') + theme(axis.text.x = element_text(angle = -90, hjust = 1))
JS distance for each subtype (i.e. all samples aggregated) as function of the minimum (or max) number of cells
od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000) state_split <- split(annot, annot$condition, drop=TRUE) condition_tables <- state_split %>% lapply(function(x){table(x$subtype)}) condition_df <- condition_tables %>% as.data.frame condition_df <- condition_df %>% mutate(min_cells = mapply(min, epilepsy.Freq, healthy.Freq)) condition_df <- condition_df %>% mutate(max_cells = mapply(max, epilepsy.Freq, healthy.Freq)) subtype_split <- state_split %>% lapply(function(x){split(x, x$subtype, drop=TRUE)}) healthy_probs <- subtype_split$healthy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10)) epilepsy_probs <- subtype_split$epilepsy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10)) all_dists <- Map(JensenShannon, healthy_probs, epilepsy_probs) %>% as_tibble all_dists_gathered <- gather(all_dists, key=subtype, value=js_distance) %>% mutate(min_cells = condition_df$min_cells, max_cells = condition_df$max_cells) ggplot(all_dists_gathered, aes(y=js_distance, x=min_cells, col=subtype)) +geom_point() + theme(axis.text.x = element_text(angle = -90, hjust = 1)) ggplot(all_dists_gathered, aes(y=js_distance, x=max_cells, col=subtype)) +geom_point() + theme(axis.text.x = element_text(angle = -90, hjust = 1))
Dependency for single subtypes
od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000) nr_cells <- c(30, 60, seq(100, 1000, by=50)) nr_cells2 <- (seq(30,300,by=15)) do_dist_cells <- function(nr.cell, subtype, pseudo.prob=1e-8, dist=js) { healthy_probs <- subtype_split$healthy[[subtype]] %>% SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=nr.cell, pseudo.count=pseudo.prob) epilepsy_probs <- subtype_split$epilepsy[[subtype]] %>% SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=nr.cell, pseudo.count=pseudo.prob) if (dist=='js') { js.dist <- JensenShannon( healthy_probs, epilepsy_probs) return(js.dist) } else if (dist=='correlation') { cor.dist <- 1-cor(healthy_probs, epilepsy_probs) return(cor.dist) } } get_all_jscells <- function(subtype, cell.nr.vec, pseudo.prob=1e-8, dist='js'){ js_nrcell <- cell.nr.vec %>% lapply(do_dist_cells, subtype, pseudo.prob, dist) jscell_df <- bind_cols(value=unlist(js_nrcell), nr.cells=cell.nr.vec) %>% mutate(subtype=subtype) #return(ggplot(jscell_df, aes(y=value, x=nr.cells))+geom_point()+ggtitle(subtype)) } #all_jscells <- annot$subtype %>% unique %>% lapply(get_all_jscells, nr_cells) do_plot <- function(sub.df){ p <- ggplot(sub.df, aes(x=nr.cells, y=value, col=subtype))+geom_line()+ theme(axis.text.x = element_text(angle = -90, hjust = 1)) return(p) } #all_plots <- all_jscells %>% lapply(do_plot) #subl1 <- do.call(rbind, all_jscells[1:5]) #p <- ggplot(all_jscells[[20]], aes(x=nr.cells, y=value, col=subtype))+geom_point()+theme(axis.text.x = element_text(angle = -90, hjust = 1)) #plot_grid(plotlist=all_plots[1:4], nrow=2) subtypes_ <- annot$subtype %>% unique sub_jscells <- get_all_jscells('L2_Lamp5', nr_cells2, dist='js') sub_jscells_rep <- replicate(3, list(get_all_jscells('L2_Lamp5', nr_cells2, dist='js'))) %>% bind_rows p <- sub_jscells_rep %>% ggplot(aes(x=nr.cells, y=value, col=subtype))+geom_point()+theme(axis.text.x = element_text(angle = -90, hjust = 1)) p
nr_genes=seq(100, 10000, 500) do_js_genes <- function(nr.genes, pseudo.prob=1e-8) { od_genes = conos:::getOdGenesUniformly(con_object$samples, nr.genes) healthy_probs <- subtype_split$healthy$L2_Lamp5 %>% SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=100, pseudo.count=pseudo.prob) epilepsy_probs <- subtype_split$epilepsy$L2_Lamp5 %>% SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=100, pseudo.count=pseudo.prob) js.dist <- JensenShannon( healthy_probs, epilepsy_probs) return(js.dist) } js_nrgenes <- nr_genes %>% lapply(do_js_genes) jsgenes_df <- bind_cols(value=unlist(js_nrgenes), nr.genes=nr_genes) ggplot(jsgenes_df, aes(y=value, x=nr.genes))+geom_point()+ggtitle('JSD for L2_Lamp5, using x number of OD genes')
Load DE genes generated with conos (deseq2) and use top x number of genes
de_genez <- readRDS(file.path('/home/larsc/data/eps_10x_de_celltype.rds')) GetTopGenes <- function(de.result, nr, value='gene.names', filter.mito=TRUE){ res.df <- de.result$res res.df <- res.df %>% dplyr::mutate(gene.names = rownames(res.df)) if (filter.mito){ res.df <- dplyr::filter(res.df, !stringr::str_detect(gene.names, "^MT-")) } res.sorted <- res.df %>% arrange(padj) return(res.sorted[[value]][1:nr]) } top_genez <- de_genez %>% lapply(GetTopGenes, 500, filter.mito=FALSE)
100 cells pairwise , top x DE genes
ObtainDistributions <- function(sub.annot, some.genes, rbound.panel, cellid.col, nr.cell, pseudo.count){ return(SelectCellProbs(sub.annot, rbound.panel, cellid.col, nr.cell, some.genes, pseudo.count)) } # just to rearrange the inputs for mapply IndividualCellProbsList <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){ sub.split <- split(annotation, annotation[, sub.col], drop=T) cell.distributions <- mapply(ObtainDistributions, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F) return(cell.distributions) } ProbsToJSD <- function(state.split, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count){ list.of.lists <- state.split %>% lapply(IndividualCellProbsList, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count) all.sc.dists.de <- Map(CalculateAllJSD, list.of.lists$healthy, list.of.lists$epilepsy) return(all.sc.dists.de %>% as_tibble) } # specific for this data but saves some lines all_sc_dists_de <- ProbsToJSD(state_split, rbound_panel, 4, 2, 100, top_genez, 10^(-8)) all_scd_de_gathered <- gather(all_sc_dists_de, key=subtype, value=JSD) ggplot(all_scd_de_gathered, aes(y=JSD, x=subtype)) + geom_violin(aes(col=subtype))+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
Sam's genes, not equal number of genes
de_genes <- readRDS("/d0-mendel/home/demharters/R/projects/UPF9_14_17_19_22_23_24_32_33/de_multilevel_genes.rds") sam_de_count <- de_genes %>% lapply(length) %>% as.data.frame #names(de_genez)==names(de_genes) # just checking CheckOverlap <- function(x,y){ # is it completely off or wut y=y[1:100] return(sum(x %in% y)/100) } check_overlap <- Map(CheckOverlap, top_genez, de_genes) # low for pvalb nos since it only has 16 all_sc_dists_de_sam <- ProbsToJSD(state_split, rbound_panel, 4, 2, 100, de_genes, 10^(-8)) all_scd_de_gathered_sam <- gather(all_sc_dists_de_sam, key=subtype, value=JSD) ggplot(all_scd_de_gathered_sam, aes(y=JSD, x=subtype)) + geom_violin(aes(col=subtype))+ theme(axis.text.x = element_text(angle = -90, hjust = 1))#+geom_jitter(aes(col=subtype), alpha = 0.05, size=0.5)
same number of cells pairwise, same number of OD genes
od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000) healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 100, od_genes, 10^(-8)) epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 100, od_genes, 10^(-8)) all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs) all_sc_dists <- all_singlecell_dists %>% as_tibble all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd) ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_violin(aes(col=subtype))+ theme(axis.text.x = element_text(angle = -90, hjust = 1)) #+geom_jitter(aes(col=subtype), alpha = 0.05, size=0.5)
Constant number of OD genes, constant number of cells, normalize variance.
od_genes = conos:::getOdGenesUniformly(con_object$samples, 100) healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 100, od_genes, 10^(-8), norm.var=T) epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 100, od_genes, 10^(-8), norm.var=T) all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs) all_sc_dists <- all_singlecell_dists %>% as_tibble all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd) ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_violin(aes(col=subtype))+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
pairwise 100 cells, union of most DE genes
top_genes_union <- top_genez %>% unlist %>% unique healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 100, top_genes_union, 10^(-8)) epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 100, top_genes_union, 10^(-8)) all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs) all_sc_dists <- all_singlecell_dists %>% as_tibble all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd) ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_violin(aes(col=subtype))+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
Now Aggregation of cells, same number of cells, 2000 od genes
od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000) od_genes_list <- rep(list(od_genes), 20) IndividualCellProbsAgg <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){ sub.split <- split(annotation, annotation[, sub.col], drop=T) cell.distributions <- mapply(SelectCellProbsAggregated, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F) return(cell.distributions) } sub_probs_agg <- state_split %>% lapply(IndividualCellProbsAgg, rbound_panel, 4, 2, 200, od_genes_list, 10^(-8)) jsd_agg <- Map(JensenShannon, sub_probs_agg$healthy, sub_probs_agg$epilepsy) %>% as_tibble jsd_agg_gathered <- gather(jsd_agg, key=subtype, value=js_distance_cell_agg) ggplot(jsd_agg_gathered, aes(y=js_distance_cell_agg, x=subtype)) +geom_bar(stat='identity', col='hotpink') + theme(axis.text.x = element_text(angle = -90, hjust = 1))
Now Aggregation of cells, each with their own top x
IndividualCellProbsAgg <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){ sub.split <- split(annotation, annotation[, sub.col], drop=T) cell.distributions <- mapply(SelectCellProbsAggregated, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F) return(cell.distributions) } sub_probs_agg <- state_split %>% lapply(IndividualCellProbsAgg, rbound_panel, 4, 2, 100, top_genez, 10^(-8)) jsd_agg <- Map(JensenShannon, sub_probs_agg$healthy, sub_probs_agg$epilepsy) %>% as_tibble jsd_agg_gathered <- gather(jsd_agg, key=subtype, value=js_distance_cell_agg) ggplot(jsd_agg_gathered, aes(y=js_distance_cell_agg, x=subtype)) +geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = -90, hjust = 1))
30 cells, 1000 od genes
od_genes = conos:::getOdGenesUniformly(con_object$samples, 1000) healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 30, od_genes, 10^(-8)) epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 30, od_genes, 10^(-8)) all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs) all_sc_dists <- all_singlecell_dists %>% as_tibble all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd) ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_boxplot(alpha=0.5)+ theme(axis.text.x = element_text(angle = -90, hjust = 1)) #+geom_jitter(aes(col=subtype), alpha = 0.05, size=0.5)
previous cell but more iterations
iterate_sc_js <- function(avg=FALSE){ healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 30, od_genes, 10^(-8)) epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 30, od_genes, 10^(-8)) all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs) if (avg) { all_singlecell_dists <- all_singlecell_dists %>% lapply(mean) } all_sc_dists <- all_singlecell_dists %>% as_tibble list(all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd)) } js_its <- replicate(5, iterate_sc_js()) js_its_df <- do.call(rbind, js_its) ggplot(js_its_df, aes(y=jsd, x=subtype)) + geom_boxplot(alpha=0.5)+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
try averaging the jsd for each iteration
js_its_agg <- replicate(5, iterate_sc_js(avg=T)) js_its_agg_df <- do.call(rbind, js_its_agg) ggplot(js_its_agg_df, aes(y=jsd, x=subtype)) + geom_boxplot(alpha=0.5)+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
aggregate and do some iterations
od_genes = conos:::getOdGenesUniformly(con_object$samples, 1000) od_genes_list <- rep(list(od_genes), 20) domultcellaggs <- function() { sub_probs_agg <- state_split %>% lapply(IndividualCellProbsAgg, rbound_panel, 4, 2, 30, od_genes_list, 10^(-8)) jsd_agg <- Map(JensenShannon, sub_probs_agg$healthy, sub_probs_agg$epilepsy) %>% as_tibble jsd_agg_gathered <- gather(jsd_agg, key=subtype, value=js_distance_cell_agg) return(list(jsd_agg_gathered)) } aggs_jsd <- replicate(10, domultcellaggs()) aggs_jsd_df <- do.call(rbind, aggs_jsd) ggplot(aggs_jsd_df, aes(y=js_distance_cell_agg, x=subtype)) +geom_boxplot()+#geom_jitter()+ theme(axis.text.x = element_text(angle = -90, hjust = 1))
condition_tables
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.