In VPetukhov/dropEstAnalysis: Analysis to produce figures from dropEst paper

knitr::read_chunk("../../analysis/chunks.R")

library(ggplot2)
library(dplyr)
library(dropestr)
library(dropEstAnalysis)
library(Matrix)

theme_set(theme_base)

set.seed(42)
kDropEstData <- '../../data/dropest/10x/frozen_bmmc_healthy_donor1/'
kEstFolder <- paste0(kDropEstData, 'est_11_10_umi_quality/')
k10xFolder <- paste0(kDropEstData, 'filtered_matrices_mex/hg19/')
kAnnotationData <- '../../data/annotation/'
kOutputPath <- '../../output/'

Read data

Link to original dataset.

holder <- readRDS(paste0(kEstFolder, 'bmmc_no_umi.rds'))
genes <- read.table(paste0(k10xFolder, 'genes.tsv')) %>% 
  filter(V2 %in% names(which(table(V2) == 1)))
gene_id_to_names <- setNames(genes$V2, genes$V1)
holder$cm <- holder$cm[grep("^[^;]+$", rownames(holder$cm)),]
holder$cm_raw <- holder$cm_raw[grep("^[^;]+$", rownames(holder$cm_raw)),]

umis_per_cell <- sort(Matrix::colSums(holder$cm_raw), decreasing=T)
est_cell_num <- EstimateCellsNumber(umis_per_cell)

Quality scores:

scores <- ScorePipelineCells(holder, mit.chromosome.name='MT', 
                             predict.all=T, verbose=T)[names(umis_per_cell)]
PlotCellScores(scores, cells.number=est_cell_num)

Pagoda run:

real_cbs <- names(scores)[1:est_cell_num$expected]
real_cbs <- real_cbs[scores[real_cbs] > 0.5]
real_cbs <- union(names(scores)[scores > 0.9], real_cbs)

r_cm <- holder$cm_raw[, real_cbs]
r_cm <- r_cm[intersect(rownames(r_cm), names(gene_id_to_names)), ]
rownames(r_cm) <- gene_id_to_names[rownames(r_cm)]

pgd <- GetPagoda(r_cm, n.cores=30, tsne.iter.num=5000)

# clusters <- pgd$clusters$PCA$infomap
# write.csv(clusters, paste0(kAnnotationData, 'bmmc1_clusters.csv'))

# Pagoda uses stochastic clustering algorithm, so we saved clusters from one run
clusters <- read.csv(paste0(kAnnotationData, 'bmmc1_clusters.csv'), row.names=1)
clusters <- setNames(clusters$x, rownames(clusters))
log_mtx <- log10(1e-3 + as.matrix(pgd$counts[names(clusters), ]))

Initial clustering:

PlotPagodaEmbeding(pgd, clusters=clusters, show.ticks=F)

Initial labeling

Description:
https://www.bdbiosciences.com/documents/Bcell_Brochure.pdf - B cells
https://www.bdbiosciences.com/documents/cd_marker_handbook.pdf - CD Markers

de_genes <- pgd$getDifferentialGenes(type='PCA', groups=clusters, 
                                     upregulated.only=T) %>% lapply(rownames)

major_cell_types <- lst(
  `T cells` = sapply(de_genes, function(genes) 'CD3D' %in% genes) %>% 
    which() %>% names() %>% as.integer(),
  `B cells` = sapply(de_genes, function(genes) 'MS4A1' %in% genes) %>% 
    which() %>% names() %>% as.integer()
)

major_type_clusts <- major_cell_types %>% unlist()
if (length(major_type_clusts) != length(unique(major_type_clusts))) 
  stop("Something goes wrong")

heatmap_genes <- c(
  'CD19', 'MME', 'MS4A1',
  'CD3D',
  'LYZ', 'CD14',
  'GZMA', 'GZMB', 'GNLY', 'NKG7',
  'FCER1A', 'CST3',
  'CD34', 'PTPRC', 'ITGB1', 'ENG',
  'EPCAM', 'APOE',
  'GYPA', 'CD36', 'TFRC'
  )

heatmap_clusters <- clusters[!(clusters %in% unlist(major_cell_types))]
heatmap_clusters <- heatmap_clusters[heatmap_clusters > 9]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)

type_ids <- c(major_cell_types, lst(
  `CD14+ Monocytes` = c(2),
  `NK cells` = c(4),
  `Dendritic cells` = c(14),
  `Monocyte progenitors` = c(11, 15),
  `Epithelial cells` = c(9),
  `Erythroblasts` = c(5, 6, 8)
))

type_ids$`B cells` <- c(type_ids$`B cells`, 10, 12:13)
type_ids$`T cells` <- c(type_ids$`T cells`, 16:17)

markers_df <- data.frame(
  Type = c("B cells", "T cells", "CD14+ Monocytes", "NK cells", "Dendritic cells", 
           "Monocyte progenitors", "Epithelial cells", "Erythroblasts"),
  Markers = c("CD19, MME (CD10), MS4A1 (CD20)", "CD3D", 
              "LYZ, CD14", "GZMA, GZMB, GNLY, NKG7", "FCER1A, CST3", 
              "LYZ, CD34", 
              "EPCAM (CD326), CD226-, APOE (CD165)", "GYPA (CD235a), CD36, TFRC (CD71)")
)

markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(markers_df$Type)]
markers_df

clusters_annotated <- AnnotateClusters(clusters, type_ids)
PlotClustering(pgd, clusters_annotated)

B cells

heatmap_genes <- c(
  'MS4A1', 'CD40', 'IL4R', 'IL7R',
  'CD34', 'CD38', 'MME',
  'CD19',
  'CD37', 'IGLL5')

heatmap_clusters <- clusters[clusters %in% type_ids$`B cells`]
# heatmap_clusters <- heatmap_clusters[heatmap_clusters > 9]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)

b_markers_df <- data.frame(
  Type = c("Immature B cells", "Pre-pro B cells", "Pre B cells", "Non-dividing Pre B cells"),
  Markers = c("MS4A1 (CD20), CD40, IL4R, IL7R-", "CD34, CD38, MME (CD10), CD24-, IL7R-", 
              "CD34-, CD40-, IL7R+, IL4R-, CD19+", "CD34, IGLL5")
)

type_ids <- c(type_ids, lst(
  `Immature B cells` = c(3),
  `Pre-pro B cells` = c(13),
  `Non-dividing Pre B cells` = c(10),
  `Pre B cells` = c(12)
  ))

type_ids$`B cells` <- NULL

b_markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(b_markers_df$Type)]
b_markers_df

T cells

heatmap_genes <- c('CCR7', "CD3E", "CD8B", "SELL", "GNLY", "GZMA", "GZMB", "GZMH", 
                   "GZMK", "PRF1", "NKG7", "IL7R", "CD4")

heatmap_clusters <- clusters[clusters %in% type_ids$`T cells`]
PlotExpressionHeatmap(log_mtx, heatmap_clusters, heatmap_genes)

t_markers_df <- data.frame(
  Type = c("Cytotoxic T cells", "T cells"),
  Markers = c("NKG7, GZMA, GZMH, GZMK", "CD3E, CD8B, IL7R")
)

type_ids <- c(type_ids, lst(
  `Cytotoxic T cells` = c(7)
))

type_ids$`T cells` <- setdiff(type_ids$`T cells`, type_ids$`Cytotoxic T cells`)

t_markers_df$Clusters <- sapply(type_ids, paste, collapse=", ")[as.character(t_markers_df$Type)]
t_markers_df

clusters_annotated <- AnnotateClusters(clusters, type_ids)
write.csv(data.frame(Barcode=names(clusters_annotated), 
                     Type=as.vector(clusters_annotated)), 
          paste0(kAnnotationData, 'bmmc1_clusters_annotated.csv'))

All markers

all_markers_df <- bind_rows(list(markers_df, t_markers_df, b_markers_df))
write.csv(all_markers_df[c('Type', 'Markers')], 
          paste0(kOutputPath, 'tables/annotation_bmmc1_markers.csv'), row.names=F)
all_markers_df

Expression plots

plot_mtx <- apply(log_mtx, 2, function(vec) scales::rescale(rank(vec)))

raster_width <- 8 / 3
raster_height <- 8 / 4
raster_dpi <- 150

clusters_annotated <- AnnotateClusters(clusters, type_ids)
plot_clusters_annotated <- clusters_annotated

long_type_names <- c("CD14+ Monocytes", "Non-dividing Pre B cells", "Monocyte progenitors", 
                     "Epithelial cells", "Cytotoxic T cells", "Immature B cells", 
                     "Dendritic cells", "Pre-pro B cells")
for (type in long_type_names) {
  plot_clusters_annotated[plot_clusters_annotated == type] <- sub(" ", "\n", type)
}

gg_annotated <- PlotClustering(pgd, plot_clusters_annotated, lineheight=0.9, size=0.3, 
                               raster=T, raster.width=raster_width, 
                               raster.height=raster_height, raster.dpi=raster_dpi) +
  scale_size_continuous(range=c(2, 3)) + 
  theme(axis.title=element_blank(), plot.margin=margin())

plot_genes <- c('CD3D', 'NKG7', 
                'MS4A1', 'IGLL5', 'CD37', 
                'GYPA', 'EPCAM', 
                'FCER1A', 'CD14', 'LYZ', 'CD34')

gene_plots <-  lapply(plot_genes, PlotGeneFraction, pgd, plot_mtx, title.x=0.04, 
                      title.y=0.99, legend.position="none", size=0.3, 
                      raster=T, raster.width=raster_width, 
                      raster.height=raster_height, raster.dpi=raster_dpi)

gene_plots <- c(list(gg_annotated), gene_plots)

gg_fig <- cowplot::plot_grid(plotlist=gene_plots, ncol=3) +
  theme(plot.margin=margin(1, 1, 1, 1))

gg_fig

ggsave(paste0(kOutputPath, 'figures/supp_annotation_bmmc1.pdf'), width=8, height=8)

# # Web app
# library(pagoda2)
# go_env <- p2.generate.human.go(pgd)
# pgd$testPathwayOverdispersion(setenv = go_env, verbose = T, correlation.distance.threshold = 0.9,
#                               recalculate.pca = F, min.pathway.size = 100, max.pathway.size = 1000)
# 
# go_sets <- p2.generate.human.go.web(colnames(pgd$counts))
# de_sets <- get.de.geneset(pgd, groups = pgd$clusters$PCA$infomap, prefix = 'de_')
# go_sets <- c(go_sets, de_sets)
# 
# additional_metadata <- list()
# additional_metadata$altCluster <- as.factor(clusters_annotated) %>%
#   p2.metadata.from.factor(displayname='Annotated', s=0.7, v=0.8, start=0, end=0.5)
# 
# is_rescued <- rep(T, ncol(r_cm)) %>% setNames(colnames(r_cm))
# is_rescued[intersect(names(umis_per_cell)[1:2000], colnames(r_cm))] <- F
# additional_metadata$isRescued <- p2.metadata.from.factor(as.factor(is_rescued), displayname='IsRescued')
# 
# pgd_web_object <- make.p2.app(pgd, dendrogramCellGroups = pgd$clusters$PCA$infomap,
#                               additionalMetadata = additional_metadata,
#                               geneSets = go_sets, show.clusters = T)
# 
# pgd_web_object$serializeToStaticFast(binary.filename = paste0(kEstFolder, 'bmmc1_pagoda_annotated.bin'))
# saveRDS(pgd_web_object, paste0(kEstFolder, 'pagoda_annotation_web.rds'))
# 
# # show.app(pgd_web_object, "bmmc")