knitr::opts_chunk$set(echo = TRUE, 
                      message=T, warning=T, 
                      cache = F, cache.lazy = F,
                      dev = "png", dpi = 100, fig.width = 10, fig.height = 10)

parameter

change the settings for your situation

h5file_10x <- params$h5file
save2seurat <- params$seuratfile
hto_info <- setNames(params$sample,
                     params$hashtag)

shoulders of giants

加载依赖包

options(stringsAsFactors = F)
# rm(list = ls())

library(Seurat)
library(magrittr)

library(ggplot2)
library(patchwork)

library(ggExtra)
library(stringr)
library(tidyverse)

HTO classification

读取原始表达矩阵数据

h5d <- Read10X_h5(h5file_10x)

# Gene Expression UMI data
h5 <- CreateSeuratObject(counts = h5d$`Gene Expression`)
# ncount derived from MT genes
mtgenes <- grep("MT-", rownames(h5), value = T, ignore.case = T)
# in case macaca mito genes
if(length(mtgenes) == 0) mtgenes <- read.csv(file = system.file("rmd", "mcc_MT_genes.csv", package = "convgene"))$gene_id

h5$nMT_RNA <- Matrix::colSums(Assays(h5, slot = "RNA")[mtgenes,])
h5$rMT_RNA <- h5$nMT_RNA/h5$nCount_RNA

Demultiplex

HashTag数据中,每个细胞带有预先标记的HashTag Oligos(HTO)。 理想情况下,每个细胞应该只能测到一种HTO标签, 但是实际HTO数据中,并不是那么完美的每个细胞只带有一种oligo序列。 需要我们先对HTO数据进行Demutiplex分析,推测细胞真正的HTO标签。

# Add HTO data as a new assay independent from RNA
HTOdata <- h5d$`Antibody Capture`
rownames(HTOdata) <- dplyr::recode(rownames(HTOdata), !!!hto_info)
h5[["HTO"]] <- CreateAssayObject(counts = HTOdata[hto_info,])

# Normalize HTO data, here we use centered log-ratio (CLR) transformation
h5 <- NormalizeData(h5, assay = "HTO", normalization.method = "CLR")
h5 <- HTODemux(h5, assay = "HTO", positive.quantile = 0.99)
h5$HTO_maxID <- factor(h5$HTO_maxID, hto_info)

## check classification
# 可视化每个HTO标签的分布
# Group cells based on the max HTO signal
RidgePlot(h5, assay = "HTO", group.by = 'HTO_maxID', #sort = T,
          features = rownames(h5[["HTO"]]), ncol = 2)

Cell Numbers

# Global classification results
table(h5$HTO_classification.global)
table(h5$HTO_classification.global)/ncol(h5)

table(h5$HTO_classification.global, h5$HTO_maxID)
FeatureScatter(h5, feature1 = "hto_Brain", feature2 = "hto_Lung", pt.size = 3)

Visualization of Quality control

不同大类下,UMI数量分布

VlnPlot(h5, features = "nCount_RNA", group.by = "HTO_classification.global",
        pt.size = 0.1, log = TRUE)

check doublet and classification

###########
Idents(h5) <- "HTO_classification.global"

# First, we will remove negative cells from the object - NOT USED!
#h5.positive <- subset(h5, idents = "Negative", invert = TRUE)

# Calculate a distance matrix using HTO
hto.dist.mtx <- as.matrix(dist(t(GetAssayData(object = h5, assay = "HTO"))))

# Calculate tSNE embeddings with a distance matrix
h5 <- RunTSNE(h5, distance.matrix = hto.dist.mtx, perplexity = 100)

# Visualization
DimPlot(h5, group.by = "HTO_classification.global")
DimPlot(h5, group.by = "HTO_classification", cells = colnames(h5)[h5$HTO_classification.global == "Singlet"])

热图可视化HTO标签分类

# To increase the efficiency of plotting, you can subsample cells using the num.cells argument
HTOHeatmap(h5, assay = "HTO", ncells = 1000)

check MT

核查线粒体基因的转录本占比情况,推测细胞是否状态良好或可能处于应激状态

MT比例直方图

ggplot(data = h5@meta.data) +
  geom_histogram(mapping = aes(x=rMT_RNA), bins = 1000) +
  geom_vline(mapping = aes(xintercept = 0.15), linetype = 2, color ="red") +
  annotate(geom = "text", x = 0.15, y = 100, label = "x = 0.15", hjust = -0.1) +
  labs(title = "Histogram showing distribution of UMI proportion of MT genes")

MT比例直方图,区分不同取材位点

# 
ggplot(data = h5@meta.data) +
  geom_histogram(mapping = aes(x=rMT_RNA, fill = HTO_maxID), bins = 1000) +
  facet_wrap(~HTO_maxID) +
  geom_vline(mapping = aes(xintercept = 0.15), linetype = 2, color ="red") +
  annotate(geom = "text", x = 0.15,y=40, label = "x = 0.15", hjust = -0.1) +
  labs(title = "Histogram showing distribution of UMI proportion of MT genes")

MT比例 vs nUMI,区分不同取材位点

# ratio - scatter
ggplot(data = h5@meta.data) + 
  geom_point(mapping = aes(x = log10(nCount_RNA + 1),
                           y = rMT_RNA,
                           color = HTO_maxID)) +
  geom_density2d(mapping = aes(x = log10(nCount_RNA + 1),
                           y =  rMT_RNA),
                 color = "blue") +
  geom_vline(mapping = aes(xintercept = log10(5e3+1)),
             linetype = 2) +
  # geom_vline(mapping = aes(xintercept = log10(1e6+1)),
  #            linetype = 2) +
  #geom_hline(mapping = aes(yintercept = 2000), linetype = 2) +
  #scale_color_manual(values = batch_color) +
  #scale_y_continuous(breaks = seq(0,9000, by = 1000))+
  facet_wrap(~HTO_maxID) +
  labs(y = "Ratio of UMIs from MT genes",
       x = "Number of UMIs, log10 scaled",
       title = str_interp("All sequenced cells \n(n = ${ncol(h5)})")) +
  theme(legend.position = "bottom")

nGene and nUMI of sequenced cells

DefaultAssay(h5) <- "RNA"

# sum(h5$nCount_RNA > 5000)
# sum(h5$nFeature_RNA > 2000)
# sum(h5$nCount_RNA > 5000 & h5$nFeature_RNA > 2000)

ggplot(data = h5@meta.data) + 
  geom_point(mapping = aes(x = log10(nCount_RNA + 1),
                           y = nFeature_RNA,
                           color = HTO_classification.global)) +
  geom_density2d(mapping = aes(x = log10(nCount_RNA + 1),
                           y = nFeature_RNA),
                 color = "blue") +
  geom_vline(mapping = aes(xintercept = log10(5e3+1)),
             linetype = 2) +
  # geom_vline(mapping = aes(xintercept = log10(1e6+1)),
  #            linetype = 2) +
  geom_hline(mapping = aes(yintercept = 2000),
             linetype = 2) +
  #scale_color_manual(values = batch_color) +
  scale_y_continuous(breaks = seq(0,11000, by = 1000))+
  facet_wrap(~HTO_classification.global) +
  labs(y = "Number of expressed genes",
       x = "Number of UMIs, log10 scaled",
       title = str_interp("All sequenced cells \n(n = ${ncol(h5)})")) +
  theme(legend.position = "bottom")
# filter cells and genes
p <- ggplot(data = h5@meta.data) + 
  geom_point(mapping = aes(x = log10(nCount_RNA + 1),
                           y = nFeature_RNA,
                           color = HTO_maxID)) +
  geom_density2d(mapping = aes(x = log10(nCount_RNA + 1),
                           y = nFeature_RNA),
                 color = "blue") +
  geom_vline(mapping = aes(xintercept = log10(5e3+1)),
             linetype = 2) +
  # geom_vline(mapping = aes(xintercept = log10(1e6+1)),
  #            linetype = 2) +
  geom_hline(mapping = aes(yintercept = 2000),
             linetype = 2) +
  #scale_color_manual(values = batch_color) +
  scale_y_continuous(breaks = seq(0,11000, by = 1000))+
  labs(y = "Number of expressed genes",
       x = "Number of UMIs, log10 scaled",
       title = str_interp("All sequenced cells \n(n = ${ncol(h5)})")) +
  theme(legend.position = "bottom") #+ coord_fixed(3.5e-4)
ggExtra::ggMarginal(p, colour = "darkblue", fill = "grey", type = "densigram")
ggplot(data = h5@meta.data) + 
  geom_point(mapping = aes(x = log10(nCount_RNA + 1),
                           y = nFeature_RNA,
                           color = HTO_maxID)) +
  geom_density2d(mapping = aes(x = log10(nCount_RNA + 1),
                           y = nFeature_RNA),
                 color = "blue") +
  geom_vline(mapping = aes(xintercept = log10(5e3+1)),
             linetype = 2) +
  # geom_vline(mapping = aes(xintercept = log10(1e6+1)),
  #            linetype = 2) +
  geom_hline(mapping = aes(yintercept = 2000),
             linetype = 2) +
  #scale_color_manual(values = batch_color) +
  scale_y_continuous(breaks = seq(0,11000, by = 1000))+
  facet_wrap(~HTO_maxID) +
  labs(y = "Number of expressed genes",
       x = "Number of UMIs, log10 scaled",
       title = str_interp("All sequenced cells \n(n = ${ncol(h5)})")) +
  theme(legend.position = "bottom") #+ coord_fixed(3.5e-4)

save seurat

if(!is.null(save2seurat)){
  save(h5, file = save2seurat)
}


zzwch/convgene documentation built on July 11, 2021, 9:41 a.m.