In LarsenLab/immuntools: Larsen lab data analysis toolbox

library(DropletUtils)
library(Seurat)
library(outliers)

library(tidyverse)

flag <- tolower(params$flag)

if (flag == "seurat"){
    print("QC based on Seurat object")

    # Load the .RData, and store the name of the loaded object in tmp
    tmp = load(params$obj_se)
    # Get the object by its name
    obj = get(tmp)
    rm(tmp)

    # add percent of mito to the object
    mito.features <- grep(pattern = "^MT-", x = rownames(x = obj), value = TRUE)
    percent.mt <- Matrix::colSums(x = GetAssayData(object = obj, slot = 'counts')[mito.features, ]) / Matrix::colSums(x = GetAssayData(object = obj, slot = 'counts'))
    obj[['percent.mt']] <- percent.mt
}

if (flag == "sce"){
    print("QC based on SingleCellExperiment object")

    # Load the .RData, and store the name of the loaded object in tmp
    tmp = load(params$obj_sce)
    # Get the object by its name
    obj = get(tmp)
    rm(tmp)
}

Sample Information

print(obj)

RNA

overview before QC

vln_before <- VlnPlot(obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
sct1_before <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
sct2_before <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
vln_before
CombinePlots(plots = list(sct1_before, sct2_before))

#rank of cells(barcodes) on total UMI count average rank of each cell/barcode corss gene/total UMI count
bcrank_before <- barcodeRanks(obj)
uniq = !duplicated(bcrank_before$rank)

plot(bcrank_before$rank[uniq], bcrank_before$total[uniq], log="xy", main = "barcode ranking - before QC", xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=metadata(bcrank_before)$inflection, col="blue", lty=2) 
abline(h=metadata(bcrank_before)$knee, col="red", lty=2) 
legend("left", legend=c("Inflection", "Knee"), bty="n", col=c("blue", "red"), lty=2, cex=1.2)

after preliminary QC

find_ot <- function(vec, prob){
  otvalue <- vec[scores(vec, type = "chisq", prob = prob) == TRUE]
  lo = max(otvalue[otvalue<median(vec)])
  hi = min(otvalue[otvalue>median(vec)])
  return(list(lo,hi))
}

# cell level
ot_mito <- as.numeric(find_ot(obj@meta.data$percent.mt, params$cut_mt_se))[2]
ot_nfeature <- as.numeric(find_ot(obj@meta.data$nFeature_RNA, params$cut_nFeature_se))
ot_ncount <- as.numeric(find_ot(obj@meta.data$nCount_RNA, 0.9))
# note: ot_ncount is not used in the subset()
obj <- subset(obj, subset = nFeature_RNA > ot_nfeature[1] & nFeature_RNA < ot_nfeature[2] & percent.mt < ot_mito) 

assay <- as.matrix(GetAssayData(object = obj, slot = "counts"))
genesum <- log(rowSums(assay) + 1 )
genelist <- names(genesum)[genesum > 0]
obj <- subset(obj, features = c(genelist, rownames(obj@assays$ADT)))  # have to specify ADT name to avoid deletion due to duplicate names between ADT and RNA

print(obj)

# plots after preliminary QC
vln_after <- VlnPlot(obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
sct1_after <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
sct2_after <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
vln_after
CombinePlots(plots = list(sct1_after, sct2_after))

# rank of cells(barcodes) on total UMI count average rank of each cell/barcode corss gene/total UMI count
bcrank_after <- barcodeRanks(obj)
uniq = !duplicated(bcrank_after$rank)

plot(bcrank_after$rank[uniq], bcrank_after$total[uniq], log="xy", main = "barcode ranking - after QC", xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=metadata(bcrank_after)$inflection, col="blue", lty=2) 
abline(h=metadata(bcrank_after)$knee, col="red", lty=2) 
legend("left", legend=c("Inflection", "Knee"), bty="n", col=c("blue", "red"), lty=2, cex=1.2)

# remove all objects except obj
# rm(list = setdiff(ls(), "obj") )

Clusters after standard pre-processing

obj <- NormalizeData(obj, normalization.method = "LogNormalize", scale.factor = 10000)
obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = 2000)

all.genes <- rownames(x = obj)
# have check these 4 samples, they don't need regress out cc genes
obj <- ScaleData(object = obj, all.genes, vars.to.regress = c("nCount_RNA", "percent.mito"))

obj <- RunPCA(obj, features = VariableFeatures(object = obj))
obj <- FindNeighbors(obj, dims = 1:10)
obj <- FindClusters(obj, resolution = 0.8)
head(Idents(obj), 5)

obj <- RunUMAP(obj, dims = 1:10)
LabelClusters(plot = DimPlot(obj, reduction = "umap", group.by = "RNA_snn_res.0.8"), id = "RNA_snn_res.0.8", size = 4)

ADT

obj <- NormalizeData(obj, assay = "ADT", normalization.method = "CLR")
obj <- ScaleData(obj, assay = "ADT")

cd4 <- as.matrix(obj@assays$ADT@scale.data)
cd4 <- cd4[rownames(cd4) == "CD4", ]
cd4 <- data.frame(cd4)

print("cd4 quantiles")
quantile(cd4$cd4, prob = seq(0,1,0.05))

ggplot(cd4, aes(x=cd4)) + geom_density() + ggtitle("ADT cd4 distribution")