In githubz0r/SecretUtils: Some utils for Conos

Introduction

library(conos)
library(pheatmap)
#library(fuck)
devtools::load_all('/home/larsc/SecretUtils')
require(pagoda2)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(cowplot)
con_object <- readRDS(file.path('/home/larsc/data/10x_preproced_graphed.rds'))
annot <- readRDS(file.path('/home/demharters/R/projects/UPF9_14_17_19_22_23_24_32_33/metadata_10x_final.rds'))

rowbind the adjusted expression values

if (is.null(annot$cellid)) {
  annot$cellid <- annot %>% rownames
}
annot <- annot %>% mutate(subtype_condition = paste(annot$subtype, annot$condition, sep='_'))
rbound_panel <- RbindPanel(con_object)

Are the cell names in the same order?

identical(rownames(rbound_panel), rownames(annot)) # thank god, maybe add a sort for the future in Panelize

Inter sample distance, aggregated

od_genes = conos:::getOdGenesUniformly(con_object$samples, 3000)
# split annot in categories (disease, healthy)
state_split <- split(annot, annot$condition, drop=TRUE)
# get probability distributions for subtype/sample categories
all_probs <- ObtainProbabilities(state_split, rbound_panel, 1, 2, 4, od_genes, pseudo.count=10^(-6))
# now get JS distances
all_distances <- Map(CalculateAllJSD, all_probs$healthy, all_probs$epilepsy)
#all_dists_dfd <- all_distances %>% lapply(function(x){data.frame(comparison=names(x), value=x, row.names=NULL)})
# use stackexchange magic to pad NAs to shorter distance vectors, and also tibbling
all_dists <- lapply(all_distances, `length<-`, max(lengths(all_distances))) %>% as_tibble
all_dists <- all_dists %>% mutate(comparison=names(all_distances$Id2_Reln))
# gather into format for easy ggplot
all_dists_gathered <- gather(all_dists, key=subtype, value=js_distance, -comparison)
# plot
ggplot(all_dists_gathered, aes(y=js_distance, x=subtype)) + geom_violin()+geom_point(aes(col=comparison), alpha = 0.6, size=1.8) +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

What about within condition distances (jensen shannon)

getUpperTri <- function(dist.numeric){
  dist.vec <- as.vector(dist.numeric)
  dist.mat <- dist.vec %>% matrix(nrow=sqrt(dist.vec %>% length))
  name.mat <- names(dist.numeric) %>% matrix(nrow=sqrt(dist.vec %>% length))
  comp.names <- name.mat[upper.tri(name.mat)]
  vals <- dist.mat[upper.tri(dist.mat)]
  names(vals) <- comp.names
  return(vals)
}

Dataframize <- function(vector.name, a.list){
  named.vector <- a.list[[vector.name]]
  init.df <- bind_cols(value=named.vector, comparison=names(named.vector))
  full.df <- init.df %>% mutate(subtype=vector.name)
  return(full.df)
}

InterSampleDists <- function(cond.sub.samples){
  inter.dists <- Map(CalculateAllJSD, cond.sub.samples, cond.sub.samples)
  inter.dists <- inter.dists %>% lapply(getUpperTri)
  inter.dists <- do.call(rbind, inter.dists %>% names %>% lapply(Dataframize, inter.dists))
  return(inter.dists)
}

Dataframize2 <- function(between.vector.name, list.of.vectors){
  between.vector <- list.of.vectors[[between.vector.name]]
  comparison <- between.vector %>% names
  df <- (bind_cols(value=between.vector, comparison=comparison))
  df$subtype <- between.vector.name
  return(df)
}
all_probs <- ObtainProbabilities(state_split, rbound_panel, 1, 2, 4, od_genes, pseudo.count=10^(-6))
eps_distances <- Map(CalculateAllJSD, all_probs$epilepsy, all_probs$epilepsy) 
health_distances <- Map(CalculateAllJSD, all_probs$healthy, all_probs$healthy)
between_distances <- Map(CalculateAllJSD, all_probs$healthy, all_probs$epilepsy)
# if we want to try viktors JS plot we replace the rbound panel with neighbour smoothed
# and we replace sample probabilities with individual cell probabilities

eps_dist2 <- eps_distances %>% lapply(getUpperTri)
health_dist2 <- health_distances %>% lapply(getUpperTri)
between_dist2 <- names(between_distances) %>% lapply(Dataframize2, between_distances) %>% bind_rows

eps_dist3 <- do.call(rbind, eps_dist2 %>% names %>% lapply(Dataframize, eps_dist2))
health_dist3 <- do.call(rbind, health_dist2 %>% names %>% lapply(Dataframize, health_dist2))


combined_dists <- rbind(eps_dist3, health_dist3) %>% mutate(condition = c(rep('epilepsy', 60), rep('healthy', 114)))
combined_dists2 <- rbind(combined_dists, mutate(between_dist2, condition='between'))
combined_dists2 %>% ggplot(aes(x=subtype, y=value, col=condition))+geom_boxplot()+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

Overall JS, all sample-conds aggregated let's see overall JS dist.

od_genes = conos:::getOdGenesUniformly(con_object$samples, 1000)
state_split <- split(annot, annot$condition, drop=TRUE)
subtype_split <- state_split %>% lapply(function(x){split(x, x$subtype, drop=TRUE)})
healthy_probs <- subtype_split$healthy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10))
epilepsy_probs <- subtype_split$epilepsy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10))

all_dists <- Map(JensenShannon, healthy_probs, epilepsy_probs) %>% as_tibble
all_dists_gathered <- gather(all_dists, key=subtype, value=js_distance)
ggplot(all_dists_gathered, aes(y=js_distance, x=subtype)) +geom_bar(stat='identity', col='hotpink') +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

JS distance for each subtype (i.e. all samples aggregated) as function of the minimum (or max) number of cells

od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000)
state_split <- split(annot, annot$condition, drop=TRUE)
condition_tables <- state_split %>% lapply(function(x){table(x$subtype)})
condition_df <- condition_tables %>% as.data.frame
condition_df <- condition_df %>% mutate(min_cells = mapply(min, epilepsy.Freq, healthy.Freq))
condition_df <- condition_df %>% mutate(max_cells = mapply(max, epilepsy.Freq, healthy.Freq))
subtype_split <- state_split %>% lapply(function(x){split(x, x$subtype, drop=TRUE)})
healthy_probs <- subtype_split$healthy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10))
epilepsy_probs <- subtype_split$epilepsy %>% GetSampProbs(rbound_panel, od_genes, cellid.col = 4, pseudo.count=10^(-10))

all_dists <- Map(JensenShannon, healthy_probs, epilepsy_probs) %>% as_tibble
all_dists_gathered <- gather(all_dists, key=subtype, value=js_distance) %>% 
  mutate(min_cells = condition_df$min_cells, max_cells = condition_df$max_cells)
ggplot(all_dists_gathered, aes(y=js_distance, x=min_cells, col=subtype)) +geom_point() +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))
ggplot(all_dists_gathered, aes(y=js_distance, x=max_cells, col=subtype)) +geom_point() +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

Dependency for single subtypes

od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000)
nr_cells <- c(30, 60, seq(100, 1000, by=50))
nr_cells2 <- (seq(30,300,by=15))
do_dist_cells <- function(nr.cell, subtype, pseudo.prob=1e-8, dist=js) {
  healthy_probs <- subtype_split$healthy[[subtype]] %>% 
    SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=nr.cell, pseudo.count=pseudo.prob)
  epilepsy_probs <- subtype_split$epilepsy[[subtype]] %>% 
    SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=nr.cell, pseudo.count=pseudo.prob)
  if (dist=='js') {
    js.dist <- JensenShannon( healthy_probs, epilepsy_probs)
    return(js.dist)
  } else if (dist=='correlation') {
    cor.dist <- 1-cor(healthy_probs, epilepsy_probs)
    return(cor.dist)
  }
}

get_all_jscells <- function(subtype, cell.nr.vec, pseudo.prob=1e-8, dist='js'){
  js_nrcell <- cell.nr.vec %>% lapply(do_dist_cells, subtype, pseudo.prob, dist)
  jscell_df <- bind_cols(value=unlist(js_nrcell), nr.cells=cell.nr.vec) %>% mutate(subtype=subtype)
  #return(ggplot(jscell_df, aes(y=value, x=nr.cells))+geom_point()+ggtitle(subtype))
}

#all_jscells <- annot$subtype %>% unique %>% lapply(get_all_jscells, nr_cells)

do_plot <- function(sub.df){
  p <- ggplot(sub.df, aes(x=nr.cells, y=value, col=subtype))+geom_line()+
    theme(axis.text.x = element_text(angle = -90, hjust = 1))
  return(p)
}

#all_plots <- all_jscells %>% lapply(do_plot)
#subl1 <- do.call(rbind, all_jscells[1:5])

#p <- ggplot(all_jscells[[20]], aes(x=nr.cells, y=value, col=subtype))+geom_point()+theme(axis.text.x = element_text(angle = -90, hjust = 1))
#plot_grid(plotlist=all_plots[1:4], nrow=2)
subtypes_ <- annot$subtype %>% unique
sub_jscells <- get_all_jscells('L2_Lamp5', nr_cells2, dist='js')
sub_jscells_rep <- replicate(3, list(get_all_jscells('L2_Lamp5', nr_cells2, dist='js'))) %>% bind_rows
p <- sub_jscells_rep %>% ggplot(aes(x=nr.cells, y=value, col=subtype))+geom_point()+theme(axis.text.x = element_text(angle = -90, hjust = 1))
p

nr_genes=seq(100, 10000, 500)
do_js_genes <- function(nr.genes, pseudo.prob=1e-8) {
  od_genes = conos:::getOdGenesUniformly(con_object$samples, nr.genes)
  healthy_probs <- subtype_split$healthy$L2_Lamp5 %>% 
    SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=100, pseudo.count=pseudo.prob)
  epilepsy_probs <- subtype_split$epilepsy$L2_Lamp5 %>% 
    SelectCellProbsAggregated(od_genes, rbound_panel, cellid.col = 4, nr.cell=100, pseudo.count=pseudo.prob)
  js.dist <- JensenShannon( healthy_probs, epilepsy_probs)
  return(js.dist)
}
js_nrgenes <- nr_genes %>% lapply(do_js_genes)
jsgenes_df <- bind_cols(value=unlist(js_nrgenes), nr.genes=nr_genes)
ggplot(jsgenes_df, aes(y=value, x=nr.genes))+geom_point()+ggtitle('JSD for L2_Lamp5, using x number of OD genes')

Load DE genes generated with conos (deseq2) and use top x number of genes

de_genez <- readRDS(file.path('/home/larsc/data/eps_10x_de_celltype.rds'))

GetTopGenes <- function(de.result, nr, value='gene.names', filter.mito=TRUE){
  res.df <- de.result$res
  res.df <- res.df %>% dplyr::mutate(gene.names = rownames(res.df))
  if (filter.mito){
    res.df <- dplyr::filter(res.df, !stringr::str_detect(gene.names, "^MT-"))
  }
  res.sorted <- res.df %>% arrange(padj)
  return(res.sorted[[value]][1:nr])
}

top_genez <- de_genez %>% lapply(GetTopGenes, 500, filter.mito=FALSE)

100 cells pairwise , top x DE genes

ObtainDistributions <- function(sub.annot, some.genes, rbound.panel, cellid.col, nr.cell, pseudo.count){
    return(SelectCellProbs(sub.annot, rbound.panel, cellid.col, nr.cell, some.genes, pseudo.count))
  } # just to rearrange the inputs for mapply

IndividualCellProbsList <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  cell.distributions <- mapply(ObtainDistributions, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F)
  return(cell.distributions)
}
ProbsToJSD <- function(state.split, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count){
  list.of.lists <- state.split %>% lapply(IndividualCellProbsList, rbound.panel, cellid.col, sub.col, 
                                          nr.cell, genes.list, pseudo.count)
  all.sc.dists.de <- Map(CalculateAllJSD, list.of.lists$healthy, list.of.lists$epilepsy)
  return(all.sc.dists.de %>% as_tibble)
} # specific for this data but saves some lines

all_sc_dists_de <- ProbsToJSD(state_split, rbound_panel, 4, 2, 100, top_genez, 10^(-8))
all_scd_de_gathered <- gather(all_sc_dists_de, key=subtype, value=JSD)
ggplot(all_scd_de_gathered, aes(y=JSD, x=subtype)) + geom_violin(aes(col=subtype))+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

Sam's genes, not equal number of genes

de_genes <- readRDS("/d0-mendel/home/demharters/R/projects/UPF9_14_17_19_22_23_24_32_33/de_multilevel_genes.rds")
sam_de_count <- de_genes %>% lapply(length) %>% as.data.frame
#names(de_genez)==names(de_genes) # just checking
CheckOverlap <- function(x,y){ # is it completely off or wut
  y=y[1:100]
  return(sum(x %in% y)/100)
}
check_overlap <- Map(CheckOverlap, top_genez, de_genes) # low for pvalb nos since it only has 16
all_sc_dists_de_sam <- ProbsToJSD(state_split, rbound_panel, 4, 2, 100, de_genes, 10^(-8))
all_scd_de_gathered_sam <- gather(all_sc_dists_de_sam, key=subtype, value=JSD)
ggplot(all_scd_de_gathered_sam, aes(y=JSD, x=subtype)) + geom_violin(aes(col=subtype))+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))#+geom_jitter(aes(col=subtype), alpha = 0.05, size=0.5)

same number of cells pairwise, same number of OD genes

od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000)
healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 100, od_genes, 10^(-8))
epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 100, od_genes, 10^(-8))
all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs)
all_sc_dists <- all_singlecell_dists %>% as_tibble
all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd)
ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_violin(aes(col=subtype))+
  theme(axis.text.x = element_text(angle = -90, hjust = 1)) #+geom_jitter(aes(col=subtype), alpha = 0.05, size=0.5)

Constant number of OD genes, constant number of cells, normalize variance.

od_genes = conos:::getOdGenesUniformly(con_object$samples, 100)
healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 100, od_genes, 10^(-8), norm.var=T)
epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 100, od_genes, 10^(-8), norm.var=T)
all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs)
all_sc_dists <- all_singlecell_dists %>% as_tibble
all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd)
ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_violin(aes(col=subtype))+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

pairwise 100 cells, union of most DE genes

top_genes_union <- top_genez %>% unlist %>% unique
healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 100, top_genes_union, 10^(-8))
epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 100, top_genes_union, 10^(-8))
all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs)
all_sc_dists <- all_singlecell_dists %>% as_tibble
all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd)
ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_violin(aes(col=subtype))+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

Now Aggregation of cells, same number of cells, 2000 od genes

od_genes = conos:::getOdGenesUniformly(con_object$samples, 2000)
od_genes_list <- rep(list(od_genes), 20)

IndividualCellProbsAgg <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  cell.distributions <- mapply(SelectCellProbsAggregated, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F)
  return(cell.distributions)
}
sub_probs_agg <- state_split %>% lapply(IndividualCellProbsAgg, rbound_panel, 4, 2, 200, od_genes_list, 10^(-8))
jsd_agg <- Map(JensenShannon, sub_probs_agg$healthy, sub_probs_agg$epilepsy) %>% as_tibble
jsd_agg_gathered <- gather(jsd_agg, key=subtype, value=js_distance_cell_agg)
ggplot(jsd_agg_gathered, aes(y=js_distance_cell_agg, x=subtype)) +geom_bar(stat='identity', col='hotpink') +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

Now Aggregation of cells, each with their own top x

IndividualCellProbsAgg <- function(annotation, rbound.panel, cellid.col, sub.col, nr.cell, genes.list, pseudo.count=0){
  sub.split <- split(annotation, annotation[, sub.col], drop=T)
  cell.distributions <- mapply(SelectCellProbsAggregated, sub.split, genes.list, MoreArgs=list(rbound.panel, cellid.col, nr.cell, pseudo.count), SIMPLIFY=F)
  return(cell.distributions)
}
sub_probs_agg <- state_split %>% lapply(IndividualCellProbsAgg, rbound_panel, 4, 2, 100, top_genez, 10^(-8))
jsd_agg <- Map(JensenShannon, sub_probs_agg$healthy, sub_probs_agg$epilepsy) %>% as_tibble
jsd_agg_gathered <- gather(jsd_agg, key=subtype, value=js_distance_cell_agg)
ggplot(jsd_agg_gathered, aes(y=js_distance_cell_agg, x=subtype)) +geom_bar(stat='identity') +
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

30 cells, 1000 od genes

od_genes = conos:::getOdGenesUniformly(con_object$samples, 1000)
healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 30, od_genes, 10^(-8))
epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 30, od_genes, 10^(-8))
all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs)
all_sc_dists <- all_singlecell_dists %>% as_tibble
all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd)
ggplot(all_scd_gathered, aes(y=jsd, x=subtype)) + geom_boxplot(alpha=0.5)+
  theme(axis.text.x = element_text(angle = -90, hjust = 1)) #+geom_jitter(aes(col=subtype), alpha = 0.05, size=0.5)

previous cell but more iterations

iterate_sc_js <- function(avg=FALSE){
  healthcellprobs <- IndividualCellProbs(state_split$healthy, rbound_panel, 4, 2, 30, od_genes, 10^(-8))
  epscellprobs <- IndividualCellProbs(state_split$epilepsy, rbound_panel, 4, 2, 30, od_genes, 10^(-8))
  all_singlecell_dists <- Map(CalculateAllJSD, healthcellprobs, epscellprobs)
   if (avg) {
    all_singlecell_dists <- all_singlecell_dists %>% lapply(mean)
  }
  all_sc_dists <- all_singlecell_dists %>% as_tibble
  list(all_scd_gathered <- gather(all_sc_dists, key=subtype, value=jsd))
}
js_its <- replicate(5, iterate_sc_js())
js_its_df <- do.call(rbind, js_its)
ggplot(js_its_df, aes(y=jsd, x=subtype)) + geom_boxplot(alpha=0.5)+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

try averaging the jsd for each iteration

js_its_agg <- replicate(5, iterate_sc_js(avg=T))
js_its_agg_df <- do.call(rbind, js_its_agg)
ggplot(js_its_agg_df, aes(y=jsd, x=subtype)) + geom_boxplot(alpha=0.5)+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))

aggregate and do some iterations

od_genes = conos:::getOdGenesUniformly(con_object$samples, 1000)
od_genes_list <- rep(list(od_genes), 20)

domultcellaggs <- function() {
  sub_probs_agg <- state_split %>% lapply(IndividualCellProbsAgg, rbound_panel, 4, 2, 30, od_genes_list, 10^(-8))
  jsd_agg <- Map(JensenShannon, sub_probs_agg$healthy, sub_probs_agg$epilepsy) %>% as_tibble
  jsd_agg_gathered <- gather(jsd_agg, key=subtype, value=js_distance_cell_agg)
  return(list(jsd_agg_gathered))
}
aggs_jsd <- replicate(10, domultcellaggs())
aggs_jsd_df <- do.call(rbind, aggs_jsd)
ggplot(aggs_jsd_df, aes(y=js_distance_cell_agg, x=subtype)) +geom_boxplot()+#geom_jitter()+
  theme(axis.text.x = element_text(angle = -90, hjust = 1))