# Jake Yeung
# make_clusters_from_topics_mixturemodel.R
# 2020-04-18
# DESCRIPTION
#
# Get clusters from mixture model
#
# FOR HELP
#
# Rscript make_clusters_from_topics_mixturemodel.R --help
#
# AUTHOR: Jake Yeung (j.yeung@hubrecht.eu)
# LAB: Quantitative Biology Lab (https://www.hubrecht.eu/research-groups/van-oudenaarden-group/)
# CREATED ON: 2020-04-18
# LAST CHANGE: see git log
# LICENSE: MIT License (see: http://opensource.org/licenses/MIT)
suppressPackageStartupMessages(library("argparse"))
library(mixtools)
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(Matrix)
library(hash)
library(igraph)
library(umap)
library(topicmodels)
library(scchicFuncs)
library(ggrepel)
library(forcats)
# # for annotating bin to gene
# if (args$species == "mouse"){
# library(TxDb.Mmusculus.UCSC.mm10.knownGene)
# library(org.Mm.eg.db)
# } else {
# print(paste("Species:", args$species, "not yet coded"))
# }
library(ChIPseeker)
# create parser object
parser <- ArgumentParser()
# specify our desired options
# by default ArgumentParser will add an help option
parser$add_argument('-infile', metavar='INFILE',
help='lda output .RData (expects out.lda)')
parser$add_argument('-outprefix', metavar='OUTPREFIX include dir',
help='Output prefix for output .RData and .pdf objects')
parser$add_argument('-annotfile', metavar='CSVFILE',
help='Tab-delimited file with columns cell and cluster which defines cells into each cluster')
# parser$add_argument('-topicskeep', metavar='List of numbers', nargs = "+",
# help='list of topics to keep (prefixes add later? e.g. -topicskeep 1 2 3 4)')
parser$add_argument('-mark', metavar='HistoneMarkName',
help='Name of histone mark')
parser$add_argument("-v", "--verbose", action="store_true", default=TRUE,
help="Print extra output [default]")
# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
args <- parser$parse_args()
# print some progress messages to stderr if "quietly" wasn't requested
if ( args$verbose ) {
print("Arguments:")
print(args)
}
# Contants ----------------------------------------------------------------
jsep <- "" # separation between topic and number
# Get clusters from topics
dat.annot <- fread(args$annotfile)
topics.keep <- unique(dat.annot$cluster)
topics.keep <- topics.keep[!is.na(topics.keep)]
names(topics.keep) <- topics.keep
# topics.keep <- paste("topic", args$topicskeep, sep = jsep)
# topics.names <- topics.keep
# names(topics.keep) <- topics.names
# these settings alter the clustering and UMAP
jsettings <- umap.defaults
jsettings$n_neighbors <- 30
jsettings$min_dist <- 0.1
jsettings$random_state <- 123
cbPalette <- c("#696969", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#006400", "#FFB6C1", "#32CD32", "#0b1b7f", "#ff9f7d", "#eb9d01", "#7fbedf") # let's hope you don't have more clusters than cbPalettes... otherwise need to recycle these
# Load LDA outputs for single stains --------------------------------------------------------
jmark <- args$mark
inf.lda <- args$infile
assertthat::assert_that(file.exists(inf.lda))
bname <- ClipLast(basename(inf.lda), jsep = "\\.", jsep.out = ".")
outprefix <- args$outprefix
print(paste("Writing output files using this prefix:", outprefix))
# Load data ---------------------------------------------------------------
outf <- paste0(outprefix, ".RData")
outpdf <- paste0(outprefix, ".pdf")
load(inf.lda, v=T)
tm.result <- posterior(out.lda)
# dont separate with _
colnames(tm.result$topics) <- paste("topic", colnames(tm.result$topics), sep = jsep)
rownames(tm.result$terms) <- paste("topic", rownames(tm.result$terms), sep = jsep)
topics.mat <- tm.result$topics
dat.umap.long <- DoUmapAndLouvain(topics.mat, jsettings)
# left_join(., annots.dat) # in general case dont add annots
dat.umap.long <- dat.umap.long %>%
rowwise() %>%
mutate(plate = ClipLast(cell, jsep = "_"))
# add var
dat.impute.log <- log2(t(tm.result$topics %*% tm.result$terms))
(jchromos <- paste("chr", seq(19), sep = ""))
dat.var <- CalculateVarAll(dat.impute.log, jchromos)
print("Check datumaplong and datvar")
print(head(dat.umap.long))
print(head(dat.var))
dat.umap.long <- left_join(dat.umap.long, dat.var)
print("Check datumaplong again")
print(head(dat.umap.long))
dat.umap.long.merge.tmp <- left_join(dat.umap.long, dat.annot %>% dplyr::select(c(cell, cluster)))
dat.umap.long$mark <- jmark
m.louv <- ggplot(dat.umap.long, aes(x = umap1, y = umap2, color = louvain)) + geom_point() + theme_bw() + theme(aspect.ratio=1, panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
scale_color_manual(values = cbPalette) + facet_wrap(~plate) + ggtitle(jmark)
m.cluster <- ggplot(dat.umap.long.merge.tmp, aes(x = umap1, y = umap2, color = cluster)) + geom_point() + theme_bw() + theme(aspect.ratio=1, panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
scale_color_manual(values = cbPalette) + facet_wrap(~plate) + ggtitle(jmark)
m.var <- ggplot(dat.umap.long, aes(x = umap1, y = umap2, color = cell.var.within.sum.norm)) + geom_point() + theme_bw() +
theme(aspect.ratio=1, panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
scale_color_viridis_c(direction = -1) + facet_wrap(~plate) + ggtitle(jmark)
# Define topics ----------------------------------------------------------
# plot and make objects
pdf(outpdf, useDingbats = FALSE)
print(m.cluster)
print(m.louv)
print(m.var)
jthres <- 0.5
# label cells by topic
par(mfrow=c(1,1), mar=c(5.1, 4.1, 4.1, 2.1), mgp=c(3, 1, 0), las=0)
mm.celltype.lst <- lapply(topics.keep, function(jtopic){
print(jtopic)
cells.keep <- subset(dat.annot, cluster == jtopic)$cell
# tvec.raw <- sort(tm.result$topics[, jtopic])
# # transform
# tvec <- log(tvec.raw / (1 - tvec.raw))
# # xline <- quantile(tvec, )
# mm <- normalmixEM(x = tvec, lambda = c(0.9, 0.1), mu = c(-5, -1), sigma = c(2, 1), k = 2)
# (xline <- min(mm$x[which(mm$posterior[, 1] < jthres)]))
# xcells <- names(tvec)[which(tvec > xline)]
# print(paste(length(xcells), "/", length(tvec), "assigned to", jtopic))
# # get topic value for assigned cells
# tvec.raw.filt <- tvec.raw[xcells]
# (xline <- min(mm$x[which(mm$posterior[, 1] < jthres)]))
# # xline <- max(mm$x[indx.btwn][which(post.filt[, 2] > 0.5)])
# plot(density(tvec), main = paste(jtopic, jthres), xlab = "Log Odds [log(p / (1 - p))]")
# abline(v = xline, col = 'blue')
# plot.mixEM(mm, whichplots = 2, xlab2 = "Log Odds [log(p / (1 - p))]", main2 = paste(jtopic, jthres))
# abline(v = xline, col = 'blue')
# cells.keep <- xcells
m.check <- PlotXYWithColor(dat.umap.long %>% mutate(is.celltype = cell %in% cells.keep), xvar = "umap1", yvar = "umap2", cname = "is.celltype", jtitle = paste(jtopic, jthres), cont.color = FALSE, col.palette = cbPalette, use.ggrastr=FALSE)
print(m.check)
return(list(topic = jtopic, cells.keep = cells.keep))
# return(list(topic = jtopic, topic.weight = tvec.raw.filt, celltype = xcells, mm = mm, threshold = xline))
})
celltypes <- lapply(names(mm.celltype.lst), function(ctype){
data.frame(cluster = ctype, cell = mm.celltype.lst[[ctype]]$cells.keep,
topic = mm.celltype.lst[[ctype]]$topic,
stringsAsFactors = FALSE)
}) %>%
bind_rows()
# # handle replicates
# celltypes.dedup <- celltypes %>%
# group_by(cell) %>%
# filter(topic.weight == max(topic.weight))
#
# assertthat::assert_that(!all(duplicated(celltypes.dedup)))
celltypes.dedup <- celltypes
print("Check celltypes datumap celltypes")
print(head(celltypes))
print(head(dat.umap.long))
print(head(celltypes.dedup))
dat.merge <- left_join(dat.umap.long, celltypes.dedup)
dat.merge$plate <- sapply(dat.merge$cell, function(x) ClipLast(x, jsep = "_"))
# handle duplicates
# show final
m.final <- ggplot(dat.merge, aes(x = umap1, y = umap2, color = topic)) + geom_point() +
theme_bw() + theme(aspect.ratio=1, panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
scale_color_manual(values = cbPalette, na.value = "grey85")
print(m.final)
dev.off()
# save annotated dat to output
save(dat.merge, count.mat, mm.celltype.lst, out.lda, file = outf)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.