knitr::opts_chunk$set( collapse = TRUE, warning = FALSE, cache = TRUE, fig.width = 14, fig.width = 14, comment = "#>" )
library(scChIX) library(dplyr) library(ggplot2) library(data.table) library(Matrix) library(igraph) library(umap)
First we load the two pre-trained multinomial models (one for each mark) inferred from the single-incubated data.
The model is a probability of getting a read within a region relative to other regions.
This can be estimated from as simple as just fraction of reads in bin / total reads for each cluster.
Here we ran Latent Dirichlet Allocation (Package: topicmodels
) to estimate this bin-specific probability genome-wide.
# data(ObjsForFittingSubset) data(ModelFromSingleForFitting) # Loading objects: # dat.impute.repress.lst # dat.impute.active data(RawDblCountMatSubset) # Loading objects: # count.mat.dbl.subset
Now for every double-incubated cell (here we take a subset of 90 cells for speed), we perform model selection. The fitting infers which cluster-pair (one from H3K27me3, one from H3K9me3) best fits the observed double-incubated cut for each cell.
# bounds for fitting mixing fraction wlower <- 0 wupper <- 1 # method for fitting mixing fraction jmethod <- "Brent" count.mat.dbl <- count.mat.dbl.subset cell.count.raw.merged.lst <- as.list(as.data.frame(as.matrix(count.mat.dbl))) system.time( act.repress.coord.lst <- lapply(cell.count.raw.merged.lst, function(cell.count.raw.merged){ optim.out <- FitMixingWeight(cell.count.raw.merged = cell.count.raw.merged, dat.impute.repress.lst = dat.impute.repress.lst, dat.impute.active = dat.impute.active, w.init = 0.5, w.lower = wlower, w.upper = wupper, jmethod = "Brent") ll.mat <- GetLLMerged(optim.out$par, cell.count.raw.merged, dat.impute.repress.lst, dat.impute.active, return.mat = TRUE) return(list(ll.mat = ll.mat, w = optim.out$par)) }) )
We plot the fit for one cell as an example of the output. Here we show a double-incubated granulocyte cell (which we know from ground truth from FACS) is correctly predicted to come from a mixture of granulocyte-specific H3K27me3 distribution plus a granulocyte-specific H3K9me3 distribution.
(jcell.tmp <- names(act.repress.coord.lst)[[1]]) (jclst <- subset(annots.dat, cell == jcell.tmp)$celltype) jcell.tmp <- "BM-EtOH-MNctrl-K27m3-K9m3-p1_220" jclst <- "Granulocyte" # ground truth LL.mat.tmp <- act.repress.coord.lst[[jcell.tmp]]$ll.mat LL.mat.tmp <- data.frame(H3K27me3 = rownames(LL.mat.tmp), LL.mat.tmp, stringsAsFactors = FALSE) LL.long.tmp <- data.table::melt(LL.mat.tmp, id.vars = "H3K27me3", value.name = "LL", variable.name = "H3K9me3") k27me3.ordering.all <- c("topic16", "topic3", "topic9", "topic25", "topic11") k9me3.ordering.all <- c("topic12", "topic28", "topic1", "topic20") ctype.ordering.all <- c("Granulocytes", "Bcells", "NKcells") k27me3.clstr.labs <- list("topic16" = "Granulocytes", "topic3" = "Bcells", "topic9" = "Bcells", "topic25" = "NKcells", "topic11" = "NKcells") k9me3.clstr.labs <- list("topic12" = "Granulocytes", "topic28" = "Bcells", "topic1" = "NKcells", "topic20" = "NKcells") LL.long.tmp$H3K27me3 <- factor(LL.long.tmp$H3K27me3, levels = k27me3.ordering.all) LL.long.tmp$H3K9me3 <- factor(LL.long.tmp$H3K9me3, levels = k9me3.ordering.all) m <- ggplot(LL.long.tmp, aes(x = H3K27me3, y = H3K9me3, fill = LL)) + geom_tile() + theme_bw() + scale_fill_viridis_c() + ggtitle(paste("Ground truth:", jclst, "\nCell name: ", jcell.tmp)) + theme(aspect.ratio= 4/8, panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1), legend.position = "bottom") print(m)
For each double-incubated cell, we select the best cluster-pair inferred from the model and summarize the assignments in a 2D grid.
# Summarize all cells ---------------------------------------------------- cell.vec <- names(act.repress.coord.lst) names(cell.vec) <- cell.vec coords.dbl <- lapply(cell.vec, function(jcell){ jfit <- act.repress.coord.lst[[jcell]] jweight <- act.repress.coord.lst[[jcell]]$w p.mat <- SoftMax(jfit$ll.mat) jcoord <- which(jfit$ll.mat == max(jfit$ll.mat), arr.ind = TRUE) jmax <- max(p.mat) jclstr.k27me3 <- rownames(p.mat)[[jcoord[[1]]]] jclstr.k9me3 <- colnames(p.mat)[[jcoord[[2]]]] if (grepl("_", jclstr.k27me3)){ jclstr.k27me3 <- strsplit(jclstr.k27me3, split = "_")[[1]][[2]] } if (grepl("_", jclstr.k9me3)){ jclstr.k9me3 <- strsplit(jclstr.k9me3, split = "_")[[1]][[2]] } out.dat <- data.frame(cell = jcell, clstr.k27me3 = jclstr.k27me3, clstr.k9me3 = jclstr.k9me3, lnprob = jmax, w = jweight, stringsAsFactors = FALSE) return(out.dat) }) %>% bind_rows() coords.dbl.annots <- left_join(coords.dbl, annots.dat) coords.dbl.annots$clstr.k27me3 <- factor(coords.dbl.annots$clstr.k27me3, levels = k27me3.ordering.all) coords.dbl.annots$clstr.k9me3 <- factor(coords.dbl.annots$clstr.k9me3, levels = k9me3.ordering.all) # annotate clusters coords.dbl.annots$clstr.k27me3.annot <- sapply(as.character(coords.dbl.annots$clstr.k27me3), function(clst){ k27me3.clstr.labs[[clst]] }) coords.dbl.annots$clstr.k9me3.annot <- sapply(as.character(coords.dbl.annots$clstr.k9me3), function(clst){ k9me3.clstr.labs[[clst]] }) cbPalette <- c("#696969", "#32CD32", "#56B4E9", "#FFB6C1", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#006400", "#FFB6C1", "#32CD32", "#0b1b7f", "#ff9f7d", "#eb9d01", "#7fbedf") m.grid <- ggplot(coords.dbl.annots, aes(x = clstr.k27me3.annot, y = clstr.k9me3.annot, color = celltype)) + geom_point(position = ggforce::position_jitternormal(sd_x = 0.08, sd_y = 0.08)) + theme_bw() + theme(aspect.ratio=1) + scale_color_manual(values = cbPalette) + xlab("H3K27me3 Cell Type Clusters") + ylab("H3K9me3 Cell Type Clusters") + ggtitle("Each dot is a double stained cell, colored by ground truth label.\nX-Y shows the cluster pair it is assigned") print(m.grid)
For each double-incubated cell, we now have a model of how H3K27me3 and H3K9me3 are mixed together to generate the observed double-incubated cut fragments. We use this cell-specific model to assign reads in each regions to either H3K27me3 or H3K9me3.
# Deconvolve double-incubated count matrix ------------------------------------------------------ all.cells <- coords.dbl$cell names(all.cells) <- all.cells col.i <- seq_len(ncol(count.mat.dbl)) names(col.i) <- colnames(count.mat.dbl) all.x.raw <- lapply(col.i, function(i) count.mat.dbl[, i]) # https://stackoverflow.com/questions/6819804/how-to-convert-a-matrix-to-a-list-of-column-vectors-in-r/6823557 all.mixweights <- coords.dbl$w names(all.mixweights) <- all.cells all.clstr.k27me3 <- as.character(coords.dbl$clstr.k27me3) all.clstr.k9me3 <- as.character(coords.dbl$clstr.k9me3) names(all.clstr.k27me3) <- all.cells names(all.clstr.k9me3) <- all.cells all.p.active <- lapply(all.clstr.k27me3, function(clstr.k27me3) dat.impute.active[clstr.k27me3, ]) all.p.repress <- lapply(all.clstr.k9me3, function(clstr.k9me3) dat.impute.repress.lst[[clstr.k9me3]]) jnames.all <- lapply(X = list(all.x.raw, all.mixweights, all.p.active, all.p.repress), FUN = names) assertthat::assert_that(all(sapply(jnames.all, identical, jnames.all[[1]]))) # assertthat::assert_that(all(names(all.x.raw) == names(all.mixweights))) system.time( x.raw.unmixed <- lapply(all.cells, function(jcell){ # print(jcell) return(UnmixRawCounts(x.raw = all.x.raw[[jcell]], mixweight = all.mixweights[[jcell]], p.active = all.p.active[[jcell]], p.repress = all.p.repress[[jcell]], random.seed = 0)) }) )
We take the H3K27me3 cuts deconvolved from double-incubated cells and project them onto the H3K27me3 manifold previously learned from single-incubated data. And similarly we project H3K9me3 cuts onto the H3K9me3 manifold. We project using the Latent Dirichet Allocation framework, which takes a vector of discrete counts and infers the cell-to-topic weights (which tells you where the new cells will be located in the UMAP), while freezing the topic-to-region weights (which have already been learned from the single-incubated data).
We project only 10 cells onto the manifold to show how its done without having to wait too long to finish while still running on a single core.
data(H3K27me3_LdaOutputs) # out.lda.k27me3 data(H3K9me3_LdaOutputs) # out.lda.k9me3 rnames <- rownames(count.mat.dbl) x.k27me3.mat <- as.data.frame(lapply(x.raw.unmixed, function(outlst) return(outlst$x.raw.active)), row.names = rnames) x.k9me3.mat <- as.data.frame(lapply(x.raw.unmixed, function(outlst) return(outlst$x.raw.repress)), row.names = rnames) set.seed(0) nsubset <- 10 # faster rand.i1 <- sample(seq(ncol(x.k27me3.mat)), size = nsubset) rand.i2 <- sample(seq(ncol(x.k9me3.mat)), size = nsubset) print("Projecting to a few unmixed cells to K27me3 manifold") system.time( out.lda.predict.k27me3 <- topicmodels::posterior(out.lda.k27me3, t(as.matrix(x.k27me3.mat[, rand.i1]))) ) print("Projecting to a few unmixed cells to K9me3 manifold") system.time( out.lda.predict.k9me3 <- topicmodels::posterior(out.lda.k27me3, t(as.matrix(x.k27me3.mat[, rand.i2]))) )
We can visualize both UMAPs together now. We link the two UMAPs together with the deconvoved double-incubated cells as connections between the two UMAPs.
jsettings <- umap::umap.defaults jsettings$n_neighbors <- 30 jsettings$min_dist <- 0.1 jsettings$random_state <- 123 jmarks <- c("K27m3", "K9m3") names(jmarks) <- jmarks data(H3K27me3_ProjectionOutput) # out.proj.K27m3 data(H3K9me3_ProjectionOutput) # out.proj.K9m3 out.proj.lst <- list("K27m3" = out.proj.K27m3, "K9m3" = out.proj.K9m3) dat.umap.merge <- lapply(jmarks, function(jmark){ out.proj.mark <- out.proj.lst[[jmark]] tm.result <- topicmodels::posterior(out.proj.mark$out.lda) topics.mat.orig <- tm.result$topics topics.mat.proj <- out.proj.mark$out.lda.predict$topics umap.out <- umap::umap(topics.mat.orig, config = jsettings) rownames(umap.out$layout) <- rownames(topics.mat.orig) dat.umap <- data.frame(cell = rownames(umap.out$layout), umap1 = umap.out$layout[, 1], umap2 = umap.out$layout[, 2], stringsAsFactors = FALSE) umap.proj <- predict(umap.out, topics.mat.proj) dat.umap.proj <- data.frame(cell = rownames(umap.proj), umap1 = umap.proj[, 1], umap2 = umap.proj[, 2], stringsAsFactors = FALSE) dat.umap.merge <- rbind(dat.umap %>% mutate(cond = "single"), dat.umap.proj %>% mutate(cond = "double")) %>% rowwise() %>% mutate(plate = ClipLast(cell, jsep = "_", jsep.out = "_")) %>% left_join(., annots.dat) dat.umap.merge$mark <- jmark return(dat.umap.merge) }) # merge the single umaps into one, seaprate by umap2 dat.merge.singles <- dat.umap.merge %>% bind_rows() %>% group_by(mark) %>% mutate(umap1 = scale(umap1, center = TRUE, scale = TRUE), umap2 = scale(umap2, center = TRUE, scale = TRUE)) yrange <- diff(range(dat.merge.singles$umap2)) / 1.5 xrange <- diff(range(dat.merge.singles$umap1)) / 1.5 dat.merge.singles <- dat.merge.singles %>% rowwise() %>% mutate(umap2.shift = umap2, umap1.shift = ifelse(mark == jmarks[[1]], umap1 + xrange, umap1 - xrange))
m <- ggplot(dat.merge.singles, aes(x = -1 * umap1.shift, y = umap2.shift, color = celltype, shape = mark, group = cell)) + geom_point() + geom_path(size = 0.1, alpha = 0.1) + theme_bw() + theme(aspect.ratio=0.5, panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "bottom") + scale_color_manual(values = cbPalette) + geom_vline(xintercept = 0, linetype = "dotted") + ggtitle(paste( "Linked map of H3K27me3 (left) and H3K9me3 (right)")) + xlab("UMAP1 (shifted)") + ylab("UMAP2") print(m)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.