# Set seed for reproducibility set.seed(1454944673L) library(knitr) if (!params$scratch){ message("render from scratch") unlink(params$cache,recursive = T) } opts_chunk[["set"]]( audodep = TRUE, cache = TRUE, cache.lazy = FALSE, cache.path = params$cache, error = TRUE, fig.height = 8L, fig.retina = 2L, fig.width = 8L, message = FALSE, tidy = TRUE, warning = FALSE )
library(ggplot2) library(cowplot) library(tidyverse) library(Seurat) library(rio) library(janitor) rows = readRDS(file.path(params$data_dir, params$rownames)) markers = readRDS(file.path(params$data_dir, params$markers)) theme_set(theme_light(base_size = 11)) theme_update(legend.position = "bottom")
We used seurat to dentify positive and negative markers of a single cluster (specified in ident.1), compared to all other cells. FindAllMarkers
automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells.
The results data frame has the following columns :
By default the test used is "wilcox" : Wilcoxon rank sum test (default). It tends to over-represent significant clusters and we can explore other test in the future depending on this results.
seurat = readRDS(file.path(params$data_dir, params$seurat_tsne))
tsne_label = FetchData(seurat, vars.all = c("ident", "tSNE_1", "tSNE_2")) %>% as.data.frame() %>% group_by(ident) %>% summarise(x=mean(tSNE_1), y=mean(tSNE_2)) pca_label = FetchData(seurat, vars.all = c("ident", "PC1", "PC2")) %>% as.data.frame() %>% mutate(ident = seurat@ident) %>% group_by(ident) %>% summarise(x=mean(PC1), y=mean(PC2))
markers %>% rownames_to_column("id") %>% left_join(rows, by = c("id" = "gene_id")) %>% write_csv(params$output_fn)
reduce <- function(X, Y, resolution=80){ resolution <- max(resolution, 1L) resolution <- min(resolution, sqrt(.Machine$integer.max)) resolution <- as.integer(resolution) # X and Y MUST be numeric. rangeX <- range(X) rangeY <- range(Y) binX <- (rangeX[2] - rangeX[1])/resolution xid <- (X - rangeX[1])/binX xid <- as.integer(xid) binY <- (rangeY[2] - rangeY[1])/resolution yid <- (Y - rangeY[1])/binY yid <- as.integer(yid) # Getting unique IDs, provided resolution^2 < .Machine$integer.max # We use fromLast=TRUE as the last points get plotted on top. id <- xid + yid * resolution return(id) }
All markers are in the markers file.
I have selected the top 12 to plot.
I have reduced the ammount of points to plot to get a quicker turn around on the time to render the document. It summarises the cells that overlap on the same space and plot the median expression value.
I plot the markers of one of the cluster to compare.
top10 = markers %>% group_by(cluster) %>% arrange(desc(avg_logFC)) %>% filter(p_val_adj<0.001) %>% top_n(n = 12, wt = avg_logFC) gene_data = FetchData(seurat, vars.all = c("tSNE_1", "tSNE_2", "ident", unique(top10$gene))) %>% mutate(id_group=reduce(tSNE_1, tSNE_2)) %>% gather(id, counts, -tSNE_1,-tSNE_2,-ident, -id_group) %>% left_join(rows, by = c("id" = "gene_id")) genes = filter(top10, cluster == 1) %>% .[["gene"]] %>% as.character() group_by(filter(gene_data, id %in% genes), id, ident, id_group, gene_name) %>% summarise(tSNE_1=mean(tSNE_1), tSNE_2=mean(tSNE_2), value=median(counts)) %>% ggplot(aes(tSNE_1, tSNE_2)) + geom_point(aes(color=value), alpha=0.8) + scale_color_gradient2(guide = FALSE, midpoint = 0, mid = "grey90", high = "#2c7fb8") + geom_text(data=tsne_label, aes(label=ident, x, y)) + facet_wrap(~gene_name)
lapply(as.character(unique(gene_data$ident)), function(c){ cat("\n\n### ", c, " \n\n") genes = filter(top10, cluster == c) %>% .[["gene"]] %>% as.character() if (length(genes)==0){ print("No significant markers.") return(NULL) } p = group_by(filter(gene_data, id %in% genes), id, ident, id_group, gene_name) %>% summarise(tSNE_1=mean(tSNE_1), tSNE_2=mean(tSNE_2), value=median(counts)) %>% ggplot(aes(tSNE_1, tSNE_2)) + geom_point(aes(color=value), alpha=0.8) + scale_color_gradient2(guide = FALSE, midpoint = 0, mid = "grey90", high = "#2c7fb8") + geom_text(data=tsne_label, aes(label=ident, x, y)) + facet_wrap(~gene_name) print(p) }) %>% invisible()
To show the co-expression of markers I calculate the average of expression of the cells in the cluster and then calculate the distance with (1-cor(ma))^2
and the method kendall and perform the clustering with the method ward.D2.
lapply(as.character(unique(gene_data$ident)), function(c){ cat("\n\n### ", c, " top 30 markers \n\n") genes = markers %>% filter(cluster==c) %>% arrange(desc(avg_logFC)) %>% filter(p_val_adj<0.001) %>% top_n(n = 40, wt = avg_logFC) %>% .[["gene"]] %>% as.character() if (length(genes)<2){ print("Not enough genes to plot.") return(NULL) } data = FetchData(seurat, vars.all = c("tSNE_1", "tSNE_2", "ident", genes)) %>% mutate(id_group=reduce(tSNE_1, tSNE_2)) %>% gather(id, counts, -tSNE_1,-tSNE_2,-ident, -id_group) %>% left_join(rows, by = c("id" = "gene_id")) ma = data %>% group_by(gene_name, ident) %>% # summarise(value=mean(counts>quantile(counts, .75))) %>% summarise(value=mean(counts)) %>% spread(ident, value) %>% as.data.frame() %>% column_to_rownames("gene_name") %>% as.matrix() ma = ma + rnorm(nrow(ma), 0.01, sd = 0.001) xclus = hclust(as.dist((1-cor(ma, method = "kendall"))^2), method = "ward.D2") yclus = hclust(as.dist((1-cor(t(ma), method = "kendall"))^2), method = "ward.D2") xlevel = xclus$labels[xclus$order] ylevel = yclus$labels[yclus$order] p = data %>% group_by(gene_name, ident) %>% summarise(value=mean(counts)) %>% ungroup() %>% mutate(ident = factor(ident, levels=xlevel), gene_name = factor(gene_name, levels = ylevel)) %>% ggplot(aes(ident, gene_name, fill=value)) + geom_tile() + theme(axis.text.x = element_text(angle=90, hjust = 1, vjust = 0.5)) + scale_fill_gradient2(mid = "grey90", high = "#2c7fb8", midpoint = 0) print(p) cat("\n\n") }) %>% invisible()
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.