knitr::read_chunk("../../analysis/chunks.R")
library(ggplot2) library(ggrastr) library(dplyr) library(dropestr) library(dropEstAnalysis) library(Matrix) theme_set(theme_base) set.seed(42) kDataPath <- '../../data/' kOutputPath <- '../../output/' kDropEstData <- paste0(kDataPath, 'dropest/10x/pbmc8k/') kAnnotationData <- paste0(kDataPath, 'annotation/') kEstFolder <- paste0(kDropEstData, 'est_11_11_umi_quality/') k10xFolder <- paste0(kDropEstData, 'filtered_gene_bc_matrices/GRCh38/')
holder <- readRDS(paste0(kEstFolder, 'pbmc8k_no_umi.rds')) genes <- read.table(paste0(k10xFolder, 'genes.tsv')) %>% filter(V2 %in% names(which(table(V2) == 1))) gene_id_to_names <- setNames(genes$V2, genes$V1) holder$cm_raw <- holder$cm_raw[grep("^[^;]+$", rownames(holder$cm_raw)),]
umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T) est_cell_num <- EstimateCellsNumber(umis_per_cell)
scores <- ScorePipelineCells(holder, mit.chromosome.name='MT', predict.all=T)[names(umis_per_cell)] PlotCellScores(scores, cells.number=est_cell_num)
Pagoda run:
real_cbs <- names(scores)[1:est_cell_num$expected] real_cbs <- real_cbs[scores[real_cbs] > 0.9] r_cm <- holder$cm_raw[, real_cbs] r_cm <- r_cm[intersect(rownames(r_cm), names(gene_id_to_names)), ] rownames(r_cm) <- gene_id_to_names[rownames(r_cm)] pgd <- GetPagoda(r_cm, n.cores=30) # clusters <- pgd$clusters$PCA$infomap # write.csv(clusters, paste0(kAnnotationData, 'pbmc8k_clusters.csv')) # Pagoda uses stochastic clustering algorithm, so we saved clusters from one run clusters <- read.csv(paste0(kAnnotationData, 'pbmc8k_clusters.csv'), row.names=1) clusters <- setNames(clusters$x, rownames(clusters)) log_mtx <- log10(1e-3 + as.matrix(pgd$counts[names(clusters), ]))
Initial clustering:
PlotPagodaEmbeding(pgd, clusters=clusters, show.ticks=F)
de_genes <- pgd$getDifferentialGenes(type='PCA', groups=clusters, upregulated.only=T) %>% lapply(rownames) major_cell_types <- lst( `T cells` = sapply(de_genes, function(genes) 'CD3D' %in% genes) %>% which() %>% names() %>% as.integer(), `B cells` = sapply(de_genes, function(genes) 'MS4A1' %in% genes) %>% which() %>% names() %>% as.integer() ) major_type_clusts <- major_cell_types %>% unlist() if (length(major_type_clusts) != length(unique(major_type_clusts))) stop("Something goes wrong")
heatmap_genes <- c( 'MS4A1', 'CD3D', 'CD3E', 'LYZ', 'CD14', 'GNLY', 'NKG7', "GZMA", "GZMB", "GZMH", "GZMK", 'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP') heatmap_clusters <- clusters[!(clusters %in% unlist(major_cell_types))] # heatmap_clusters <- heatmap_clusters[heatmap_clusters > 20] PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
type_ids <- c(major_cell_types, lst( `CD14+ Monocytes` = c(1, 12, 13, 34), `NK cells 1` = c(9, 29, 30), `NK cells 2` = 26, `FCGR3A+ Monocytes` = 15, `Dendritic cells` = c(18, 24), `Megakaryocytes` = 31 )) type_ids$`T cells` <- c(type_ids$`T cells`, 35) markers_df <- data.frame( Type = c( "B cells", "T cells", "CD14+ Monocytes", "NK cells 1", "NK cells 2", "FCGR3A+ Monocytes", "Dendritic cells", "Megakaryocytes"), Markers = c( "MS4A1", "CD3D, CD3E", "LYZ, CD14", "GNLY, NKG7", "GZMB, NKG7", "FCGR3A, MS4A7", "FCER1A, CST3", "PPBP") ) markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(markers_df$Type)] markers_df
clusters_annotated <- AnnotateClusters(clusters, type_ids) PlotClustering(pgd, clusters_annotated)
heatmap_genes <- c( "CD3D", "CD3E", "CCR7", "CD8A", "CD8B", "IL7R", "CD4", "NKG7", "GZMA", "GZMH", "GZMK" ) heatmap_clusters <- clusters[clusters %in% type_ids$`T cells`] # heatmap_clusters <- heatmap_clusters[heatmap_clusters > 3] PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)
t_markers_df <- data.frame( Type = c("Naive T cells", "CD8 T cells", "CD4+ T cells", "Cytotoxic T cells"), Markers = c("CCR7", "CD8A, CD8B", "IL7R, CD4", "NKG7, GZMA, GZMH, GZMK") ) type_ids <- c(type_ids, lst( `Naive T cells` = c(2, 7, 21), `CD8 T cells` = c(3), `CD4+ T cells` = c(5, 10, 14, 35), `Cytotoxic T cells` = c(6, 11, 16, 17, 20, 25, 27, 32) )) type_ids$`T cells` <- NULL t_markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(t_markers_df$Type)] t_markers_df
clusters_annotated <- AnnotateClusters(clusters, type_ids) write.csv(data.frame(Barcode=names(clusters_annotated), Type=as.vector(clusters_annotated)), paste0(kAnnotationData, 'pbmc8k_clusters_annotated.csv'))
all_markers_df <- rbind(markers_df, t_markers_df) write.csv(all_markers_df[c('Type', 'Markers')], paste0(kOutputPath, 'tables/annotation_pbmc8k_markers.csv'), row.names=F) all_markers_df
raster_width <- 8 / 3 raster_height <- 8 / 4 raster_dpi <- 150 long_type_names <- c("CD14+ Monocytes", "FCGR3A+ Monocytes", "Dendritic cells", "Naive T cells", "Cytotoxic T cells", "CD4+ T cells") for (type in long_type_names) { clusters_annotated[clusters_annotated == type] <- sub(" ", "\n", type) } gg_annotation <- PlotClustering(pgd, clusters_annotated, lineheight=0.7, size=0.2, raster=T, raster.width=raster_width, raster.height=raster_height, raster.dpi=raster_dpi) + theme_pdf(show.ticks=F) + ggpubr::rremove("xylab") + scale_size_continuous(range=c(3, 3)) + theme(plot.margin=margin())
plot_mtx <- apply(log_mtx, 2, function(vec) scales::rescale(rank(vec)))
plot_genes <- c('MS4A1', 'LYZ', 'FCGR3A', 'FCER1A', 'PPBP', 'CD3D', 'CCR7', 'CD8A', 'IL7R', 'NKG7', 'GZMB') gene_plots <- lapply(plot_genes, PlotGeneFraction, pgd, plot_mtx, title.x=0.04, title.y=0.99, legend.position="none", size=0.2, raster=T, raster.width=raster_width, raster.height=raster_height, raster.dpi=raster_dpi) gene_plots <- c(list(gg_annotation), gene_plots) gg_fig <- cowplot::plot_grid(plotlist=gene_plots, ncol=3) + theme(plot.margin=margin(1, 1, 1, 1))
gg_fig
ggsave(paste0(kOutputPath, 'figures/supp_annotation_pbmc8k.pdf'), width=8, height=8)
# # Web app # go_env <- p2.generate.human.go(pgd) # pgd$testPathwayOverdispersion(setenv = go_env, verbose = T, correlation.distance.threshold = 0.9, # recalculate.pca = F, min.pathway.size = 100, max.pathway.size = 1000) # # go_sets <- p2.generate.human.go.web(colnames(pgd$counts)) # de_sets <- get.de.geneset(pgd, groups = pgd$clusters$PCA$infomap, prefix = 'de_') # go_sets <- c(go_sets, de_sets) # # additional_metadata <- list() # additional_metadata$altCluster <- p2.metadata.from.factor(as.factor(clusters_annotated), displayname = 'Annotated', s = 0.7, v = 0.8, start = 0, end = 0.5) # # pgd_web_object <- make.p2.app(pgd, dendrogramCellGroups = pgd$clusters$PCA$infomap, # additionalMetadata = additional_metadata, geneSets = go_sets, # show.clusters = T) # # pgd_web_object$serializeToStaticFast(binary.filename = paste0(kEstFolder, 'pbmc8k_pagoda.bin')) # # saveRDS(pgd_web_object, paste0(kEstFolder, 'pagoda_annotation_web.rds'))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.