library(data.table) library(NMF) library(Matrix) library(ggplot2) library(ggforce) library(RColorBrewer) library(gridExtra)
sparseM <- readRDS("../data/conradV3/merged_mouse_sparseM_V3.rds") sparseM <- t(sparseM) # No mutants # sparseM <- sparseM[grep("WT|SP|mj",rownames(sparseM)),] quickTabulate <- function(data.matrix, sparce.matrix){ histo_numers <- matrix(c(0:max(sparce.matrix), rep(0, max(sparce.matrix)+1)), ncol = 2) histo_numers[1:max(sparce.matrix)+1, 2] <- tabulate(data.matrix) histo_numers[1, 2] <- sum(sparce.matrix == 0) return(histo_numers) } hist_table <- data.table(quickTabulate(as.matrix(sparseM), sparseM)) invisible(hist_table[, Value:= as.integer(V1)]) invisible(hist_table[, Observed_Transcripts:= as.integer(V2)]) invisible(hist_table[, Percent:= Value/sum(Value)]) ggplot(hist_table, aes(Observed_Transcripts, Percent)) + geom_bar(stat = "identity") + xlab("Read count for each cell,transcript") + ylab("%") + xlim(-1,20)
cell_sums <- data.table(library_size = rowSums(sparseM), genes=rowSums(sparseM > 0), barcode=rownames(sparseM), experiment=gsub("[A-Z]+\\.","",rownames(sparseM))) cell_sums[order(library_size)] setorder(cell_sums, experiment, -library_size) cell_sums[order(-library_size)] experiments <- unique(gsub("[A-Z]+\\.","",rownames(sparseM))) cell_sums_dt <- data.table(table(cell_sums$library_size)) cell_sums_dt[,V1:=as.integer(V1)] ggplot(cell_sums_dt, aes(V1, N)) + geom_point() + ylab("Number of Cells") + xlab("Library Size (Total observed mRNA per cell)") + xlim(0,1000) # compare to total UMI ggplot(cell_sums, aes(library_size, genes)) + geom_point(alpha=0.2) + scale_y_log10() + scale_x_log10() + geom_abline(intercept=0,slope=1, colour="red") cell_sums_dt <- data.table(table(cell_sums$genes)) cell_sums_dt[,V1:=as.integer(V1)] ggplot(cell_sums_dt, aes(V1, N)) + geom_point() + ylab("Number of Cells") + xlab("Number of genes expressed per cell") + xlim(0,1000)
cell_sums$QC_fail <- FALSE ## Library Size filtering cell_sums[, library_threshold := mean(log(library_size)) - sd(log(library_size)), by=experiment] ggplot(cell_sums[library_size>50], aes(seq_along(barcode), library_size, colour=log(library_size) < library_threshold)) + geom_point(size=0.5) + scale_y_log10() + theme(legend.position = "bottom") + facet_wrap(~experiment, scales="free_x") + geom_hline(yintercept = 200) ## Number of genes expressed filtering cell_sums[, genes_threshold := mean(log(genes)) - sd(log(genes)), by=experiment] ggplot(cell_sums[library_size>50][order(-genes)], aes(seq_along(barcode), genes, colour=log(genes) < genes_threshold)) + geom_point() + scale_y_log10() + theme(legend.position = "bottom") + facet_wrap(~experiment, scales="free_x") + geom_hline(yintercept = 100) ## Apply Filters cell_sums[log(library_size) < library_threshold | library_size < 200 | log(genes) < genes_threshold | genes < 100, QC_fail := TRUE, by=experiment] ggplot(cell_sums[library_size>50], aes(seq_along(barcode), library_size, colour=QC_fail)) + geom_point(size=0.5) + scale_y_log10() + facet_wrap(~experiment, scales="free_x") cell_sums[,.(percent_fail=round(100*sum(QC_fail)/.N) ,QC_pass=sum(!QC_fail),QC_fail=sum(QC_fail)),by=c("experiment")][order(-percent_fail)] cell_sums[,.N,by=QC_fail] cell_subset <- cell_sums[QC_fail==FALSE]$barcode
rownames_cache <- rownames(sparseM) #rownames(sparseM) <- gsub("[A-Z]+\\.","",rownames_cache) sparseM <- sparseM[cell_subset,] # How expressed is Prdm9 table(sparseM[,"Prdm9"]) table(sparseM[,"Spo11"]) gene_sums <- data.table(umi = colSums(sparseM), cells=colSums(sparseM > 0), gene=colnames(sparseM)) setorder(gene_sums, -umi) gene_sums[,percent_of_cells := signif(cells/nrow(sparseM), 3)] plot(log(sort(gene_sums$umi))) plot(log(gene_sums$umi)) # Highly expressed genes head(gene_sums[order(-umi)], 20) # Widely expressed genes head(gene_sums[order(-cells)], 20) head(gene_sums[order(umi)], 20) head(gene_sums[order(cells)], 20) # Summary Table gene_sums_dt <- data.table(table(gene_sums$umi)) gene_sums_dt[,V1 := as.integer(V1)] ggplot(gene_sums_dt, aes(V1, N)) + ylab("Number of Genes") + xlab("Total observed mRNA per gene")+ geom_point() + scale_x_log10() # compare to total UMI ggplot(gene_sums, aes(umi, cells)) + geom_point(alpha=0.2) + scale_y_log10() + scale_x_log10() + geom_abline(intercept=0,slope=1, colour="red") total_cells_dt <- data.table(table(gene_sums$cells)) total_cells_dt[,V1:=as.integer(V1)] ggplot(total_cells_dt, aes(V1, N)) + ylab("Number of Genes") + xlab("Number of cells with non-zero expression")+ geom_point() + scale_x_log10() nrow(gene_sums) nrow(gene_sums[umi>5]) nrow(gene_sums[cells>5]) nrow(gene_sums[umi>5 & cells>5]) gene_subset <- gene_sums[umi>5 & cells>5][order(-umi)]$gene
I also downloaded a batch of retinal single cell data from Evan Macosko and Steve McCarroll's Drop-Seq paper for comparison as well as some bipolar retinal neurons.
```{bash, eval=FALSE}
Summarise McCarrol DGE files & bipolar retinal ```r dge.files <- list.files(pattern="*digital_expression\\.txt.gz", path="../data") summary <- data.table() for (file in dge.files){ dge <- fread(paste0("zcat < ../data/", file)) dge$gene <- NULL new_summary <- data.table( CELL_BARCODE = names(dge), NUM_GENES = apply(dge, 2, function(x) sum(x > 0)), NUM_TRANSCRIPTS = colSums(dge)) file <- gsub(".digital_expression.txt", "_gene_exon.dge.summary.txt", file) file <- gsub(".gz", "", file) write.csv(new_summary[order(-NUM_TRANSCRIPTS)], file = paste0("../data/", file), row.names=FALSE) }
Summarise 10X genomics PBMC file
library(Seurat) pbmc.data <- Read10X("../data/digital_gene_expression/10X_genomics/pbmc/hg19/") # create summary dge <- as.matrix(pbmc.data) new_summary <- data.table( CELL_BARCODE = colnames(dge), NUM_GENES = apply(dge, 2, function(x) sum(x > 0)), NUM_TRANSCRIPTS = colSums(dge)) write.csv(new_summary[order(-NUM_TRANSCRIPTS)], file = paste0("../data/digital_gene_expression/10X_genomics/pbmc/PBMC_10X_gene_exon.dge.summary.txt"), row.names=FALSE) hek.data <- Read10X("../data/digital_gene_expression/10X_genomics/293t/hg19/") # create summary dge <- as.matrix(hek.data) new_summary <- data.table( CELL_BARCODE = colnames(dge), NUM_GENES = apply(dge, 2, function(x) sum(x > 0)), NUM_TRANSCRIPTS = colSums(dge)) write.csv(new_summary[order(-NUM_TRANSCRIPTS)], file = paste0("../data/digital_gene_expression/10X_genomics/293t/293t_10X_gene_exon.dge.summary.txt"), row.names=FALSE) jurkat.data <- Read10X("../data/digital_gene_expression/10X_genomics/jurkat/hg19/") # create summary dge <- as.matrix(jurkat.data) new_summary <- data.table( CELL_BARCODE = colnames(dge), NUM_GENES = apply(dge, 2, function(x) sum(x > 0)), NUM_TRANSCRIPTS = colSums(dge)) write.csv(new_summary[order(-NUM_TRANSCRIPTS)], file = paste0("../data/digital_gene_expression/10X_genomics/jurkat/jurkat_10X_gene_exon.dge.summary.txt"), row.names=FALSE) theme_set(theme_gray())
stopifnot(digest::digest(sparseM[cell_subset, gene_subset]) == "bad0b808593dc87b14dc71cdc4cc140f") saveRDS(sparseM[cell_subset, gene_subset], "../data/conradV3/merged_mouse_QC_V3.rds") cell_sums <- cell_sums[order(-library_size)] # Cells per experiment cell_sums[, .N, by=experiment][order(-N)] cell_sums[, .(average_library_size = sum(library_size)/.N), by=experiment][order(-average_library_size)] line_type <- scale_linetype_manual(values = c(rep("solid", 13), rep("dashed", 13))) line_color <- scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2),"black","grey")) cell_sums[, cumulative_reads := cumsum(library_size), by=experiment] cell_sums[, cell_index := seq_len(.N), by=experiment] ggplot(cell_sums, aes(cell_index, cumulative_reads, colour = experiment, linetype=experiment)) + geom_line() + line_type + line_color + theme(legend.position="bottom") + facet_zoom( x = cell_index < 1500, y = cumulative_reads < 1.5e6) # # summary[, cumulative_fraction := cumsum(library_size) / sum(NUM_TRANSCRIPTS), by=experiment] # # invisible(summary[, cell_quantile := 1:length(CELL_BARCODE) / length(CELL_BARCODE), by=batch_merge]) # # # line_color <- scale_linetype_manual(values = c(rep("solid", 12), rep("dashed", 12))) # line_type <- scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2))) # # # compare cumulative reads and cell numbers # ggplot(summary, aes(cell_index, cumulative_reads, colour = batch_merge, linetype = batch_merge)) + # geom_line() + # line_color + line_type + # theme(legend.position="bottom") + # facet_zoom( x = cell_index < 1500, y = cumulative_reads < 1.5e6) # # # normalised CDF # ggplot(summary[batch!="out"], aes(cell_quantile, cumulative_fraction, colour = batch_merge, linetype = batch_merge)) + # geom_line() + # line_color + line_type + # theme(legend.position="bottom") # # # compare cumulative reads and cell numbers # ggplot(summary[batch_merge %in% c("293t_10X","MWT3330","MLH3KO","GSM1626798_P14Retina_6","FACS")], aes(cell_index, cumulative_reads, colour = batch_merge, linetype = batch_merge)) + # geom_line() + # line_color + line_type + # theme(legend.position="bottom") + # facet_zoom( x = cell_index < 1500, y = cumulative_reads < 1.5e6) # # # normalised CDF # ggplot(summary[batch_merge %in% c("293t_10X","MWT3330","MLH3KO","GSM1626798_P14Retina_6","FACS")], aes(cell_quantile, cumulative_fraction, colour = batch_merge, linetype = batch_merge)) + # geom_line() + # line_color + line_type + # theme(legend.position="bottom")
tmp <- seq(0,100,0.1) plot(tmp, sqrt(tmp), type="l", ylim=c(-1,10)) lines(tmp, asinh(tmp), col="red") lines(tmp, log(tmp), col="blue") abline(h=0) abline(0,1) # post lib size correction tmp <- sample(dge, 1e6) hist(tmp, breaks = 50000, ylim = c(0, 1e3), xlim=c(0,0.15), xlab="Expression", main="x") hist(sqrt(tmp), breaks = 1000, ylim = c(0, 1e3), xlim=c(0,0.4), xlab="Expression", main="sqrt(x)") hist(asinh(tmp), breaks = 50000, ylim = c(0, 1e3), xlim=c(0,0.1), xlab="Expression", main="asinh(x)") hist(10000*tmp, breaks = 50000, ylim = c(0, 1e3), xlim=c(0,1500), xlab="Expression", main="10000*x") hist(10000*asinh(tmp), breaks = 50000, ylim = c(0, 1e3), xlim=c(0,1000), xlab="Expression", main="10000*asinh(x)") hist(asinh(10000*tmp), breaks = 500, ylim = c(0, 1e3), xlim=c(0,10), xlab="Expression", main="asinh(10000*x)") hist(log(10000*tmp+1), breaks = 500, ylim = c(0, 1e3), xlim=c(0,10), xlab="Expression", main="log(10000*x+1)") hist(sqrt(10000*tmp), breaks = 5000, ylim = c(0, 1e3), xlim=c(0,50), xlab="Expression", main="sqrt(10000*x)")
# load raw data for preprocessing raw_data <- readRDS("merged_mouse_QC_V3.rds") raw_data <- t(raw_data) data <- normaliseDGE(raw_data, center = FALSE, scale = TRUE, threshold = 10, min_library_size = 200, gene_subset = (2/3)) library_size <- Matrix::colSums(raw_data[,rownames(data)]) # Issue warning if data changes stopifnot(digest::digest(data) == "a5a74d6b89e2835fad86f8a4ba342ca5") saveRDS(list(dge=data, library_size=library_size), "merged_mouse_normalised_V3.rds") # save a cache for easy reopening stopifnot(tools::md5sum("merged_mouse_normalised_V3.rds") == "a9556d493f27e94978d6a4ef1ae2c00a")
t-SNE enables the reduction of dimensionality of the digital gene expression matrix to 2 which enables visualisation of the component scores on a common plot.
Cells with a higher library size are found towards the edges of the plot. Some "clusters" in the t-SNE plot are enriched for cells from a specific experiment. For example MLH3 KO cells are mostly in a cluster at the bottom with a few in the main cluster on the far left. SPCI (early) facs sorted cells are on the left size and SPD (late) facs sorted cells are on the right. We will see from inspecting the components loadings and scores that this is a more general pattern with cells early in the process of spermatogenesis on the left and gradually 'moving' around the outside of the plot clockwise towards the bottom right.
However there is an issue here as the experimental conditions are confounnded by batch so it's hard to disentangle batch effect from biological effect.
data <- readRDS("merged_mouse_normalised_V3.rds") library_size <- data$library_size data <- data$dge # PCA #devtools::install_github("gabraham/flashpca/flashpcaR") library(flashpcaR) pcaresult <- flashpca(as.matrix(data), ndim=250, stan="center", verbose=T, seed=42) # tSNE set.seed(42) tsne_data <- Rtsne::Rtsne(pcaresult$projection, verbose=TRUE, pca=FALSE, perplexity = 30, max_iter=2000) set.seed(42) tsne_data <- Rtsne::Rtsne(pcaresult$projection, verbose=TRUE, pca=FALSE, perplexity = 100, max_iter=1000) save(pcaresult, file="merged_mouse_pca_full.rds") save(tsne_data, file="merged_mouse_tsne_full.rds") #load("../data/conradV3/merged_mouse_pca_full.rds")
data <- readRDS("../data/conradV3/merged_mouse_normalised_V3.rds") library_size <- data$library_size data <- data$dge load("../data/conradV3/merged_mouse_tsne_full.rds") #cell_data$genes_detected <- data.table(genes_detected, cell=names(genes_detected))[order(cell)]$genes_detected cell_data <- data.table(tsne_data$Y) setnames(cell_data, c("Tsne1","Tsne2")) cell_data$cell <- rownames(data) cell_data$experiment <- gsub("[A-Z]+\\.","",rownames(data)) cell_data$group <- gsub("_.*","",cell_data$experiment) cell_data$library_size <- library_size # [grep("WT|SP|mj",rownames(data))] marker_genes <- c("Insl3", "Stra8", "Sycp2","Piwil1","Spata31", "Prss51", "Tnp1", "Prm1", "Prm2", "mt-Rnr2","Prdm9","Prss50","Dcn","Cst9","C1qb","Rpl41") other_genes <- c("Tex101","Hmgb2","Ddx39","Calm2","Hnrnpa2b1","Tpr","Pcgf5","Sycp2","Piwil1","Tnp1","Tnp2","Hmgb4","Tex37","Spata3","Klk1b8","Paqr5") # for GFP like plot, "Mtl5","H2afb1", mito_genes <- grep("mt-",colnames(data), value=T) expression <- data.table(as.matrix(data[,c(marker_genes, mito_genes, other_genes)])) # cell_data <- cbind(cell_data, expression) ggplot(cell_data, aes(Tsne1, Tsne2, color=log(library_size))) + geom_point() + scale_colour_distiller(palette="YlOrRd", direction=1) + theme(legend.position = "bottom") + ggtitle("t-SNE - library size") ggplot(cell_data, aes(Tsne1, Tsne2, color=experiment)) + geom_point() + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") + scale_color_manual(values = c(rep(brewer.pal(12,"Paired"),2),"black","grey")) #scale_colour_brewer(palette="Paired") ggplot(cell_data, aes(Tsne1, Tsne2, color=group)) + geom_point() + scale_color_manual(values = brewer.pal(11,"Paired")) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") simplify <- theme(legend.position = "none", axis.title.x=element_blank(), axis.title.y=element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank()) print_pca <- function(i){ ggplot(cell_data, aes(Tsne1, Tsne2, color=get(i))) + geom_point(size=0.2) + scale_colour_distiller(palette="YlOrRd", direction=1) + theme(legend.position = "bottom") + ggtitle(paste0(i," - t-SNE")) + simplify } grid.arrange(grobs=list(print_pca(marker_genes[1]), print_pca(marker_genes[2]), print_pca(marker_genes[3]), print_pca(marker_genes[4]), print_pca(marker_genes[5]), print_pca(marker_genes[6]), print_pca(marker_genes[7]), print_pca(marker_genes[8]), print_pca(marker_genes[9]), print_pca(marker_genes[10]), print_pca(marker_genes[11]), print_pca(marker_genes[12]), print_pca(marker_genes[13]), print_pca(marker_genes[14]), print_pca(marker_genes[15]), print_pca(marker_genes[16])), nrow=4, heights = c(1,1,1,1))
Cells in the amorphous group tend to have lower library sizes and high mitochondrial gene expression than normal cells - suggesting poor quality cells.
# Define circle to tagg odd cells cell_data[,odd:=FALSE] cell_data[((Tsne1-(-10))^2 + (Tsne2-(+20))^2) < 23^2, odd:= TRUE] # Label Odd Cells ggplot(cell_data, aes(Tsne1, Tsne2, color=odd)) + geom_point(size=0.5) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") cell_data[,.N,by=odd] cell_data[odd==FALSE,.N,by=group][order(-N)] # Proportion of Odd by experiment ggplot(cell_data[,.N,by=c("experiment","odd")], aes(experiment,N, fill=odd)) + geom_col(position="fill") + geom_hline(yintercept = 0.5, linetype="dashed") + theme(axis.text.x=element_text(angle=40,hjust=1,vjust=1,size=15)) ggplot(cell_data[,.N,by=c("experiment","odd")], aes(experiment,N, fill=odd)) + geom_col(position="stack") + theme(axis.text.x=element_text(angle=40,hjust=1,vjust=1,size=15))
cell_data[,last_bc_nt:=substr(cell, 12, 12)] # # if some barcodes different length! # index <- regexpr("[A-Z]\\.", cell_data$cell) # cell_data$last_bc_nt <- substring(cell_data$cell, index, index) # cell_data$last_bc_nt_2 <- substring(cell_data$cell, index-1, index-1) # Where are corrected cells? ggplot(cell_data, aes(Tsne1, Tsne2, color=last_bc_nt=="N")) + geom_point(size=0.3) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") ggplot(cell_data, aes(Tsne1, Tsne2, color=last_bc_nt)) + geom_point(size=0.3) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") # N occours less in odd cells ggplot(cell_data[,.N,by=c("odd","last_bc_nt")], aes(last_bc_nt, N, fill=odd)) + geom_col(position="fill") tmp <- cell_data[,.N,by=c("odd","last_bc_nt")] tmp[,last_bc_nt := factor(last_bc_nt, levels=c("A","G","C","T","N"))] ggplot(tmp, aes(odd, N, fill=last_bc_nt)) + geom_col(position="fill") # some experiments don;t have barcodes with N tmp <- cell_data[,.N,by=c("experiment","last_bc_nt")] tmp[,last_bc_nt := factor(last_bc_nt, levels=c("A","G","C","T","N"))] ggplot(tmp, aes(experiment, N, fill=last_bc_nt)) + geom_col(position="fill") + theme(axis.text.x=element_text(angle=40,hjust=1,vjust=1,size=15)) # ggplot(cell_data[,.N,by=c("experiment","corrected")], aes(experiment,N, fill=corrected)) + # geom_col(position="fill") + # theme(axis.text.x=element_text(angle=40,hjust=1,vjust=1,size=15)) # second to last barcode seems fine cell_data[,bc11:=substr(cell, 11, 11)] tmp <- cell_data[,.N,by=c("experiment","bc11")] tmp[,bc11 := factor(bc11, levels=c("A","G","C","T","N"))] ggplot(tmp, aes(experiment, N, fill=bc11)) + geom_col(position="fill") + theme(axis.text.x=element_text(angle=40,hjust=1,vjust=1,size=15))
# # % ribosomal gene expression # raw_data <- readRDS("../data/conradV2/merged_mouse_QC_counts_2.rds") # # cell_data$rnr2_percent <- raw_data[, "mt-Rnr2"] / library_size # # cap extream values # cell_data[rnr2_percent>0.05]$rnr2_percent <- 0.05 # max_scores_bycell <- apply(SDAresults$scores, 1, max) # max_scores_bycell <- data.table(max_scores_bycell, cell=names(max_scores_bycell)) # cell_data <- merge(cell_data, max_scores_bycell, all.x = TRUE) # diff_expression <- data.table(odd=apply(data[cell_data[max_scores_bycell>20 & group=="WT"]$cell,], 2, median), normal=apply(data[cell_data[max_scores_bycell<20 & group=="WT"]$cell,], 2, median), gene=colnames(data)) #ggplot(cell_data[!is.na(max_scores_bycell)], aes(Tex101, max_scores_bycell, colour=experiment)) + geom_point() + scale_x_log10() + facet_wrap(~experiment) #cell_data[, is_mlh3:=FALSE] #cell_data[group %in% c("Mlh","MLHKO"), is_mlh3:=TRUE] # & is_mlh3==FALSE, & is_mlh3==FALSE # which genes are differnetially expressed in the odd group # diff_expression <- data.table(odd=colMeans(data[cell_data[odd==TRUE & !group %in% c("Hormad","Mlh3")]$cell,]), normal=colMeans(data[cell_data[odd==FALSE & !group %in% c("Hormad","Mlh3")]$cell,]), gene=colnames(data)) # diff_expression[,ratio := (odd+0.1)/(normal+0.1)] # head(diff_expression[order(-ratio)], 20) # head(diff_expression[order(ratio)], 20) hist(cell_data$mt_geneset, breaks=500) cell_data$mt_geneset <- sqrt(rowSums(sparseM[cell_data$cell,mt_genes[-1]]) / rowSums(sparseM[cell_data$cell,])) # create mito-marker genes mt_genes <- grep("mt-",colnames(data), value=T) cell_data[, mt_geneset := Reduce(`+`, .SD), .SDcols = mt_genes ] cell_data[, mt_geneset := `mt-Rnr2` ] # Desnity comparisons ggplot(cell_data, aes(library_size, colour=odd)) + geom_density() + scale_x_log10() + facet_wrap(~experiment) ggplot(cell_data, aes(mt_geneset, colour=odd)) + geom_density() + scale_x_log10() + facet_wrap(~experiment) # Sina Plots - MT & LIB ggplot(cell_data, aes(odd, mt_geneset, colour=odd)) + geom_sina() + scale_y_log10() + facet_wrap(~group) ggplot(cell_data, aes(odd, library_size, colour=odd)) + geom_sina() + scale_y_log10() + facet_wrap(~group) #ggplot(cell_data[group %in% c("WT","Mlh")], aes(mt_geneset, group=experiment, colour=group)) + geom_density() + scale_x_log10() #ggplot(cell_data[group %in% c("WT","Mlh")], aes(mt_geneset, group=experiment, colour=group)) + geom_density() + scale_x_log10() + scale_colour_brewer(palette="Paired") + facet_wrap(~odd, ncol = 1) # Scatter plot #ggplot(cell_data, aes(mt_geneset, library_size, colour=odd)) + geom_point(alpha=0.2) + scale_y_log10() + scale_colour_brewer(palette="Paired") + facet_wrap(~group) ggplot(cell_data, aes(Tsne1, Tsne2, color=mt_geneset)) + geom_point() + theme(legend.position = "bottom") + scale_colour_distiller(palette="YlOrRd", direction=1) + ggtitle("t-SNE - experiment") hist(cell_data$mt_geneset, breaks=1000) plot(cell_data$mt_geneset, cell_data$`mt-Rnr2`) ggplot(cell_data, aes(Tsne1, Tsne2, color=mt_geneset)) + geom_point() + theme(legend.position = "bottom") + scale_colour_distiller(palette="YlOrRd", direction=1) + ggtitle("t-SNE - experiment") ggplot(cell_data, aes(Tsne1, Tsne2, color=mt_geneset<2)) + geom_point(size=0.01) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") ggplot(cell_data[mt_geneset<2], aes(Tsne1, Tsne2)) + geom_point(size=0.01) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment")
This group also has a mixture of late and early gene expressing cells. Check for doublets+ by co-expression of early and late genes (like mouse vs human mRNA)
cell_data[,early_geneset := Tex101+Hmgb2+Ddx39+Calm2+Hnrnpa2b1+Tpr+Pcgf5+Sycp2+Piwil1] cell_data[,late_geneset := Prm1+Tnp1+Tnp2+Prm2+Hmgb4+Tex37+Spata3+Klk1b8+Paqr5] #ggplot(cell_data, aes(Tsne1, Tsne2, color=rnr2_percent>0.02 | (early_geneset>2.5 & late_geneset>2.5))) + # geom_point() + # theme(legend.position = "bottom") + # ggtitle("t-SNE - experiment") # Early vs Late Scatter ggplot(cell_data, aes(early_geneset, late_geneset, color=odd)) + geom_point(alpha=0.1) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") + facet_grid(~odd, scales = "free") # cap extreme values cell_data[early_geneset > quantile(early_geneset, 0.9), early_geneset:= quantile(early_geneset, 0.9)] cell_data[late_geneset > quantile(late_geneset, 0.9), late_geneset:= quantile(late_geneset, 0.9)] # normalise to max=1 cell_data[,early_geneset_scale := early_geneset / max(early_geneset)] cell_data[,late_geneset_scale := late_geneset / max(late_geneset)] # GFP like plot ggplot(cell_data, aes(Tsne1, Tsne2)) + geom_point(color=rgb(red=cell_data$late_geneset_scale, green=0, blue=cell_data$early_geneset_scale), size=0.3) + theme(legend.position = "none") + ggtitle("t-SNE - experiment") + theme_dark() + theme(panel.background = element_rect(fill = "grey20", colour = NA)) ggplot(cell_data, aes(Sycp2, Prm2, color=odd)) + geom_point(alpha=0.1) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") + facet_grid(~odd) ggplot(cell_data, aes(Piwil1, Prm2, color=odd)) + geom_point(alpha=0.1) + theme(legend.position = "bottom") + ggtitle("t-SNE - experiment") + facet_grid(~odd)
stopifnot(digest::digest(cell_data[odd==FALSE & mt_geneset<2]$cell) == "94a4bdfa1dfdbec834ba2c4f0d2f43c1") cell_subset2 <- cell_data[odd==FALSE & mt_geneset<2]$cell raw_data <- readRDS("../data/conradV3/merged_mouse_QC_V3.rds") str(raw_data[cell_subset2, ]) saveRDS(raw_data[cell_subset2, ], "../data/conradV3/merged-mouse_counts_tsne-QC_low-mt.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.