In githubz0r/SecretUtils: Some utils for Conos

library(conos)
library(tidyverse)
devtools::load_all('/home/larsc/SecretUtils')
library(cowplot)
library(splatter)
devtools::load_all('/home/viktor_petukhov/Copenhagen/NeuronalMaturation')
params <- readRDS('/home/larsc/data/splatter_lamp5_params.rds')

Make data

group_prob <- rep(1/6, 6)
de_prob <- c(0.0, 0.0, 0.2, 0.3, 0.4, 0.5)
ncellvec <- c(30, 100, 200, 500, 1000)
ngenevec <- c(100, 1000, 5000, 10000, 20000)
de_prob <- c(0.0, 0.0, 0.2, 0.3, 0.4, 0.5)
liblocvec <- c(6.5, 7, 7.5, 8, 8.5)
seeds <- c(22071, 666, 9001)
#ncellsims <- seeds %>% pbapply::pblapply(function(x){lapplyCells(ncellvec, x, de.probs=de_prob)}, cl=6)
#ngenesims <- seeds %>% pbapply::pblapply(function(x){lapplyGenes(ngenevec, x, de.probs=de_prob)}, cl=6)
#liblocsims <- seeds %>% pbapply::pblapply(function(x){lapplyLibloc(liblocvec, x, de.probs=de_prob)}, cl=6)

Generate paga dfs and p2 objects

#cellpagas_p2s <- ncellsims %>% pbapply::pblapply(SimPagaFactor, 'ncell', return.p2=T, cl=1)
#genepagas_p2s <- ngenesims %>% pbapply::pblapply(SimPagaFactor, 'ngenes', return.p2=T, cl=1)
#liblocpagas_p2s <- liblocsims %>% pbapply::pblapply(SimPagaFactor, 'libloc', return.p2=T, cl=1)

Save objects in convenient form

bindp2annot <- function(p2.paga.list, cm.annot.list) {
  p2s <- p2.paga.list$p2s; annots <- cm.annot.list$annots
  return(list(p2s=p2s, annots=annots))
}
#cellp2s_anns_unbound <- Map(bindp2annot, cellpagas_p2s, ncellsims); names(cellp2s_anns_unbound) <- seeds
#genep2s_anns_unbound <- Map(bindp2annot, genepagas_p2s, ngenesims); names(genep2s_anns_unbound) <- seeds
#liblocp2s_anns_unbound <- Map(bindp2annot, liblocpagas_p2s, liblocsims); names(liblocp2s_anns_unbound) <- seeds
#saveRDS(cellp2s_anns_unbound, '/home/larsc/data/splatter_data/cellp2s_anns_unbound.rds')
#saveRDS(genep2s_anns_unbound, '/home/larsc/data/splatter_data/genep2s_anns_unbound.rds')
#saveRDS(liblocp2s_anns_unbound, '/home/larsc/data/splatter_data/liblocp2s_anns_unbound.rds')

cellp2s_anns_unbound <- readRDS('/home/larsc/data/splatter_data/cellp2s_anns_unbound.rds')
genep2s_anns_unbound <- readRDS('/home/larsc/data/splatter_data/genep2s_anns_unbound.rds')
liblocp2s_anns_unbound <- readRDS('/home/larsc/data/splatter_data/liblocp2s_anns_unbound.rds')

cellbound <- cellp2s_anns_unbound %>% lapply(SimPagaFactor, 'ncell') %>% bind_rows
genebound <- genep2s_anns_unbound %>% lapply(SimPagaFactor, 'ngenes') %>% bind_rows
liblocbound <- liblocp2s_anns_unbound %>%  lapply(SimPagaFactor, 'libloc') %>% bind_rows

checking that low libloc has higher dropout

liblocp2s_anns_unbound$`22071`$p2s %>% lapply(function(x){x$misc$rawCounts %>% countzerocols})

#cellbound <- cellpagas_p2s %>% lapply(function(x){x$paga.df}) %>% bind_rows
#genebound <- genepagas_p2s %>% lapply(function(x){x$paga.df}) %>% bind_rows
#liblocbound <- liblocpagas_p2s %>% lapply(function(x){x$paga.df}) %>% bind_rows


cellbound %>% filter(de.levels!='ref') %>% ggplot(aes(x=ncell, y=paga.connectivity.value, col=de.levels))+
  geom_point(size=1, alpha=0.8)
genebound %>% filter(de.levels!='ref') %>% ggplot(aes(x=ngenes, y=paga.connectivity.value, col=de.levels))+
  geom_point(size=1, alpha=0.8)
liblocbound %>% filter(de.levels!='ref') %>% ggplot(aes(x=libloc, y=paga.connectivity.value, col=de.levels))+
  geom_point(size=1, alpha=0.8)

Correlation

getpcacorperseed <- function(p2s.anns, factor.class){
  pca.cms <- p2s.anns$p2s %>% lapply(function(x){x$reductions$PCA})
  cor.dist.df <- doSimCor(pca.cms, p2s.anns$annots, factor.class)
}
cellcordists <- cellp2s_anns_unbound %>% lapply(getpcacorperseed, 'ncell') %>% bind_rows
genecordists <- genep2s_anns_unbound %>% lapply(getpcacorperseed, 'ngenes') %>% bind_rows
libloccordists <- liblocp2s_anns_unbound %>% lapply(getpcacorperseed, 'libloc') %>% bind_rows

cellcordists %>% filter(de.levels!='ref') %>% ggplot(aes(x=ncell, y=correlation.distance, col=de.levels))+
  geom_point(size=1, alpha=0.8)
genecordists %>% filter(de.levels!='ref') %>% ggplot(aes(x=ngenes, y=correlation.distance, col=de.levels))+
  geom_point(size=1, alpha=0.8)
libloccordists %>% filter(de.levels!='ref') %>% ggplot(aes(x=libloc, y=correlation.distance, col=de.levels))+
  geom_point(size=1, alpha=0.8)

testing corr

testcellcor <- cellp2s_anns_unbound$`22071`
testcm <- testcellcor$p2s$`500`$reductions$PCA
testannot <- testcellcor$annots$`500` %>% ExtendFactorAnnot(500, 'ncell')
testsplit <- testannot$cellid %>% split(testannot$de.level)
all.equal(testcm %>% rownames, testannot$cellid)
submatscor <- testsplit %>% lapply(function(x){testcm[x,]}) %>% lapply(Matrix::colMeans)
testcors <- submatscor %>% lapply(function(x,reference){1-cor(x,reference)}, reference=submatscor$ref)
cor_500_from_test <- testcors %>% unlist
cor_500_from_function <- filter(cellcordists, ncell==500)$correlation.distance[1:6]
all.equal(cor_500_from_function, cor_500_from_test %>% unname)

# still looks suspicious
testcm2 <- cellp2s_anns_unbound$`22071`$p2s$`30`$reductions$PCA
testannot2 <- cellp2s_anns_unbound$`22071`$annots$`30`
testannot2_2 <- testannot2 %>% ExtendFactorAnnot(30, 'ncell')
testsplit2 <- testannot2_2 %>% split(testannot2_2$de.level)
submatscor2 <- testsplit2 %>% lapply(function(x){testcm2[x$cellid,]}) %>% lapply(Matrix::colMeans)
submatscor2 %>% lapply(function(x, reference=submatscor2$ref){1-cor(x,reference)}) %>% unlist

Sanity check for PAGA, bind the cell mat and plot

cellrep1 <- cellp2s_anns_unbound$`22071`
mats <- cellrep1$p2s %>% lapply(function(x){x$misc$rawCounts}) %>% do.call(rbind,.)
boundp2 <- t(mats) %>% NeuronalMaturation::GetPagoda(n.odgenes=3000)
ncell_levels <- (cellrep1$annots %>% lapply(function(x){x$ncell %>% unique})) %>% unlist
annots_extended <- Map(ExtendFactorAnnot, cellrep1$annots, ncell_levels, MoreArgs=(list('ncell')))
boundannot <- annots_extended %>% bind_rows
partitions <- paste0(boundannot$ncell, '_', boundannot$de.level) %>% str_sort(numeric=TRUE) %>% unique

boundannot$partitions <- factor(paste0(boundannot$ncell, '_', boundannot$de.level), levels=partitions)
#boundannot$partitions <- factor(paste0(boundannot$ncell, '_', boundannot$de.level))
adjgraph <- igraph::as_adjacency_matrix(boundp2$graphs$PCA, attr='weight')[boundannot$cellid, boundannot$cellid]
connectivities <- GeneratePagaItems(adjgraph, sample.vector=boundannot$partitions, by.samples=T)
connectivities <- connectivities$connectivities#[partitions, partitions]
connectivities2 <- GetPagaMatrix(adjgraph, boundannot$partitions %>% as.numeric)
inds <- 1:30 %>% split(seq(1,30, by=5)) %>% as.data.frame %>% t %>% as.matrix %>% as.data.frame
submats <- inds %>% lapply(function(x){connectivities[x, x]})
names(submats) <- ncell_levels
cons <- Map(SimPaga4, submats, annots_extended, ncell_levels, MoreArgs=list('ncell'))
cons %>% bind_rows %>% filter(de.levels!='ref') %>% ggplot(aes(x=ncell, y=paga.connectivity.value, col=de.levels))+
  geom_point(size=1, alpha=0.8)

groupgroup <- setNames(boundannot$Group, boundannot$cellid)
degroup <- setNames(boundannot$de.level, boundannot$cellid)
partitiongrp <- setNames(boundannot$partitions, boundannot$cellid)
conos:::plotSamples(list(boundp2), groups=groupgroup, shuffle.colors=T, font.size=c(2,5),
                    show.legend=T, size=0.4)

# could also try correlation
boundpcamat <- boundp2$reductions$PCA
boundpcasplits <- annots_extended %>% lapply(function(x){boundpcamat[x$cellid,]})
boundcorsplits <- doSimCor(boundpcasplits, cellrep1$annots, 'ncell')


cons %>% bind_rows %>% filter(de.levels!='ref') %>% ggplot(aes(x=ncell, y=paga.connectivity.value, col=de.levels))+
  geom_point(size=1, alpha=0.8)
boundcorsplits %>% filter(de.levels!='ref') %>% ggplot(aes(x=ncell, y=correlation.distance, col=de.levels))+
  geom_point(size=1, alpha=0.8)
conos:::plotSamples(list(boundp2), groups=groupgroup, shuffle.colors=T, font.size=c(2,5),
                    show.legend=T, size=0.4)

#connectivities
#connectivities2
#cons %>% bind_rows %>% filter(de.levels!='ref') %>% ggplot(aes(x=ncell, y=paga.connectivity.value, col=de.levels))+
  #geom_point(size=1, alpha=0.8)

Let's take a quick look at the tsnes of one of the cell reps and compare with the bound tsne (in final report we should probably do this for all the combinations)

# unbound
testp2s <- cellp2s_anns_unbound$`22071`$p2s
for (x in testp2s){x$getEmbedding(type = "PCA", perplexity = 30, embeddingType = "tSNE",
                   max_iter = 1000, distance = 'cosine')}
testannot <- cellp2s_anns_unbound$`22071`$annots %>% bind_rows
grpgrp <- setNames(testannot$Group, testannot$cellid)

# bound tsne 
boundp2$getEmbedding(type = "PCA", perplexity = 30, embeddingType = "tSNE",
                   max_iter = 1000, distance = 'cosine')
ncellgrp <- setNames(testannot$ncell, testannot$cellid)
conos:::plotSamples(testp2s, groups=ncellgrp, shuffle.colors=F, font.size=c(3),
                    show.legend=F, size=0.4)
conos:::plotSamples(list(boundp2), groups=ncellgrp, shuffle.colors=F, font.size=c(3),
                    show.legend=F, size=0.4)
conos:::plotSamples(testp2s, groups=grpgrp, shuffle.colors=F, font.size=c(3),
                    show.legend=F, size=0.4)
conos:::plotSamples(list(boundp2), groups=grpgrp, shuffle.colors=F, font.size=c(3),
                    show.legend=F, size=0.4)

Let's see if this difference in dependency between unbound matrices and separate matrices PCA'd persists using batches splatSimulateGroups has 3 methods, single, groups and paths

# test
# Simulation with big batch effects
batch_faclocs <- c(0, 0, 0.1, 0.15, 0.2, 0.25) # batch 1 is reference
ncellbatch <- c(30,100,200,500,1000)
#sim2 <- splatSimulateGroups(params, batchCells = c(1830, 1830, 1830, 1830, 1830, 1830), group.prob=c(30,100,200,500,1000)/1830,
                      #batch.facLoc = c(0, 0, 0.1, 0.15, 0.2, 0.25), batch.facScale = c(0.001, 0.001, 0.1, 0.1, 0.1, 0.1), 
                      #de.prob=rep(0,5), verbose = FALSE)

#sim2 <- scater::normalize(sim2)
#scater::plotPCA(sim2, colour_by = "Batch") + ggtitle("Big batch effects")

#batchcm <- counts(sim2)
#batchannot <- sim2@colData
#batchannot$Cell <- batchannot$Cell %>% as.character # always remember this bullshit1
# extend annotation and make sub matrices
ExtendBatchAnnot <- function(annot, facloc.vec, ncell.vec){
  #browser()
  facloc.switch <- setNames(facloc.vec, annot$Batch %>% unique %>% sort)
  batchcell.switch <- setNames(ncell.vec, annot$Group %>% unique %>% sort)
  faclocs <- facloc.switch[annot$Batch]
  ncells <- batchcell.switch[annot$Group]
  annot$faclocs <- faclocs
  annot$ncell <- ncells
  return(annot)
}
#batchannot_ext <- ExtendBatchAnnot(batchannot, batch_faclocs, ncellbatch)
#batch_cellsplits <- batchannot_ext %>% split(batchannot_ext$ncell)
#batchsubmats <- batch_cellsplits %>% lapply(function(x){batchcm[, x$Cell]})

# generate p2s
#batchfullp2 <- NeuronalMaturation::GetPagoda(batchcm, n.odgenes=3000)
#batchsubp2s <- batchsubmats %>% lapply(function(x){x %>% NeuronalMaturation::GetPagoda(n.odgenes=3000)})

# save objects
#saveRDS(batchfullp2, '/home/larsc/data/splatter_data/batchvalidfullp2.rds')
#saveRDS(batchsubp2s, '/home/larsc/data/splatter_data/batchvalidsubp2s.rds')
#saveRDS(batchannot_ext, '/home/larsc/data/splatter_data/batchvalidannot.rds')

batchfullp2 <- readRDS('/home/larsc/data/splatter_data/batchvalidfullp2.rds')
batchsubp2s <- readRDS('/home/larsc/data/splatter_data/batchvalidsubp2s.rds')
batchannot_ext <- readRDS('/home/larsc/data/splatter_data/batchvalidannot.rds')
batch_cellsplits <- batchannot_ext %>% split(batchannot_ext$ncell)

# plot
batchgrp <- setNames(batchannot_ext$Batch, batchannot_ext$Cell)
ncellgrp <- setNames(batchannot_ext$ncell, batchannot_ext$Cell)
#conos:::plotSamples(list(batchfullp2), groups=batchgrp, shuffle.colors=T, font.size=c(2,5),
                    #show.legend=T, size=0.4)
#conos:::plotSamples(batchsubp2s, groups=batchgrp, shuffle.colors=T, font.size=c(2,5),
                    #show.legend=F, size=0.4)
#conos:::plotSamples(list(batchfullp2), groups=ncellgrp, shuffle.colors=T, font.size=c(2,5),
                    #show.legend=T, size=0.4)
#conos:::plotSamples(batchsubp2s, groups=ncellgrp, shuffle.colors=F, font.size=c(2,5),
                    #show.legend=F, size=0.4)


SimPagaForSubs <- function(graph, annot, is.connectivity=F){
  if (!is.connectivity) {
    sample.connectivity <- GeneratePagaItems(graph, sample.vector=annot$Batch, by.sample=T)
    connectivity.mat <- sample.connectivity$connectivities
  } else {
    connectivity.mat <- graph
  }
  batch.levels <- (paste0(annot$Batch) %>% unique)[order(paste0(annot$Batch) %>% unique)]
  n.levels <- length(batch.levels)
  comparisons <- connectivity.mat[1, ] %>% setNames(n.levels)
  ncell.batchlevels <- annot$Batch %>% split(annot$Batch) %>% lapply(length) %>% unlist
  ncell.true <- ncell.batchlevels+ncell.batchlevels[1]
  df <- dplyr::bind_cols(paga.connectivity.value=comparisons, batch=batch.levels, ncell.comparison=ncell.true,
                         ncell=rep(annot$ncell %>% unique, n.levels))
  return(df)
}



# quick sanity check with paga
#(paste0(batch_cellsplits$`30`$Batch) %>% unique)[order(paste0(batch_cellsplits$`30`$Batch) %>% unique)]
#testgraph <- igraph::as_adjacency_matrix(batchsubp2s$`1000`$graphs$PCA,
                                         #attr='weight')[batch_cellsplits$`1000`$Cell,batch_cellsplits$`1000`$Cell]
#funcres <- SimPagaForSubs(testgraph, batch_cellsplits$`1000`)
#testannot1000 <- batch_cellsplits$`1000`
#membershiptest <- testannot1000$Batch %>% as.factor %>% as.numeric
#testannot1000$Batch %>% as.factor %>% levels # correct
#testcon1000 <- GetPagaMatrix(testgraph, membershiptest)
#testcon1000[1,] # correct

# let's do the full mat first
batchfullgraph <- igraph::as_adjacency_matrix(batchfullp2$graphs$PCA,
                                         attr='weight')[batchannot_ext$Cell,batchannot_ext$Cell]
batchfulllevels <- paste(batchannot_ext$ncell, batchannot_ext$Batch) %>% str_sort(numeric=TRUE) %>% unique
batchfullpartition <- factor(paste(batchannot_ext$ncell, batchannot_ext$Batch), levels=batchfulllevels)
batchfullmem <- batchfullpartition %>% as.numeric
batchpaga_full <- GetPagaMatrix(batchfullgraph, batchfullmem)
inds <- 1:30 %>% split(seq(1,30, by=5)) %>% as.data.frame %>% t %>% as.matrix %>% as.data.frame %>% as.list
batchpagafullsubmats <- inds %>% lapply(function(x){batchpaga_full[x,x]}) 
names(batchpagafullsubmats) <- batchannot_ext$ncell %>% unique %>% as.numeric %>% sort

# individual mats now
SubBatchCons <- function(p2, annot){
  graph <- igraph::as_adjacency_matrix(p2$graphs$PCA, attr='weight')[annot$Cell, annot$Cell]
  membership.vec <- annot$Batch %>% as.factor %>% as.numeric
  paga.con <- GetPagaMatrix(graph, membership.vec)
  return(paga.con)
}
batchsubcons <- Map(SubBatchCons, batchsubp2s, batch_cellsplits)

pagas_onemat <- Map(SimPagaForSubs, batchpagafullsubmats, batch_cellsplits, MoreArgs=list(is.connectivity=T)) %>% bind_rows
pagas_manymats <- Map(SimPagaForSubs, batchsubcons, batch_cellsplits, MoreArgs=list(is.connectivity=T)) %>% bind_rows
pagas_onemat %>% filter(batch!='Batch1') %>% ggplot(aes(x=ncell, y=paga.connectivity.value, color=batch))+geom_point()
pagas_manymats %>% filter(batch!='Batch1') %>% ggplot(aes(x=ncell, y=paga.connectivity.value, color=batch))+geom_point()

conos:::plotSamples(list(batchfullp2), groups=batchgrp, shuffle.colors=T, font.size=c(2,5),
                    show.legend=T, size=0.4)
conos:::plotSamples(batchsubp2s, groups=batchgrp, shuffle.colors=T, font.size=c(2,5),
                    show.legend=F, size=0.4)
conos:::plotSamples(list(batchfullp2), groups=ncellgrp, shuffle.colors=T, font.size=c(2,5),
                    show.legend=T, size=0.4)
conos:::plotSamples(batchsubp2s, groups=ncellgrp, shuffle.colors=F, font.size=c(2,5),
                    show.legend=F, size=0.4)