library(knitr) opts_chunk$set(out.width = "100%", cache = TRUE) knitr::opts_chunk$set(dev = "png", dev.args = list(type = "cairo-png")) options(repos = c(CRAN = 'https://cloud.r-project.org'))
Dependencies:
# OVC library(subtypeHeterogeneity) library(consensusOV) # Single cell library(scater) library(scran) library(DropletUtils) library(SingleR) # Annotation library(EnsDb.Hsapiens.v86) library(org.Hs.eg.db) library(EnrichmentBrowser) # Plotting library(ComplexHeatmap) library(ggpubr)
Constants:
SUBTYPES <- c("DIF", "IMR", "MES", "PRO") CELL.TYPES <- c("EPI", "LYMPH", "MYE", "STROM", "ENDO") data.dir <- system.file("extdata", package = "subtypeHeterogeneity")
Colors
cb.pink <- "#CC79A7" cb.red <- "#D55E00" cb.blue <- "#0072B2" cb.yellow <- "#F0E442" cb.green <- "#009E73" cb.lightblue <- "#56B4E9" cb.orange <- "#E69F00" stcols <- c(cb.lightblue, cb.green, cb.orange, cb.pink) names(stcols) <- SUBTYPES
Parallel computation:
bp <- BiocParallel::registered()[[1]]
Note: The sections Preprocessing, Dimensionsality reduction, and Clustering describe the processing of the CellRanger output files at the example of Tumor T90. Tumors T59, T76, T77, and T89 were processed analogously. Execution of the code in these sections requires download of the CellRanger output files deposited in GEO (accession number: GSE154600).
Section Annotation, Subsection Visualization of results (all tumors),
and subsequent sections describe how Figures 4 and 5 of the main manuscript
were created.
The code is based on the fully processed and annotated SingleCellExperiment
s
(resulting from processing steps described in Section 2 - Section 5.3).
Read 10X output (required files: matrix.mtx
, barcodes.tsv
, genes.tsv
)
sce <- DropletUtils::read10xCounts("tumor90") dim(sce)
Annotating the rows:
rownames(sce) <- scater::uniquifyFeatureNames(rowData(sce)$ID, rowData(sce)$Symbol) location <- AnnotationDbi::mapIds(EnsDb.Hsapiens.v86, keys = rowData(sce)$ID, column = "SEQNAME", keytype = "GENEID")
Testing for deviations from ambient expression:
bcrank <- DropletUtils::barcodeRanks(counts(sce)) uniq <- !duplicated(bcrank$rank) plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy", xlab="Rank", ylab="Total UMI count", cex.lab=1.2) abline(h=bcrank$inflection, col="darkgreen", lty=2) abline(h=bcrank$knee, col="dodgerblue", lty=2) legend("bottomleft", legend=c("Inflection", "Knee"), col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)
Quality control on the cells:
df <- scater::perCellQCMetrics(sce, subsets = list(Mito = which(location == "MT")), BPPARAM = bp) par(mfrow = c(1,3)) hist(df$sum/1e3, xlab="Library sizes (thousands)", main="", breaks=20, col="grey80", ylab="Number of cells") hist(df$detected, xlab="Number of expressed genes", main="", breaks=20, col="grey80", ylab="Number of cells") hist(df$subsets_Mito_percent, xlab="Mitochondrial proportion (%)", ylab="Number of cells", breaks=20, main="", col="grey80") par(mfrow = c(1,1)) high.mito <- scater::isOutlier(df$subsets_Mito_percent, nmads = 3, type = "higher") libsize.drop <- scater::isOutlier(df$sum, nmads = 1, type = "lower", log = TRUE) feature.drop <- scater::isOutlier(df$detected, nmads = 1, type = "lower", log = TRUE) sce <- sce[,!(high.mito | libsize.drop | feature.drop)] df <- data.frame(ByHighMito = sum(high.mito), ByLibSize = sum(libsize.drop), ByFeature = sum(feature.drop), Remaining = ncol(sce)) df dim(sce)
Examining gene expression:
ave <- scater::calculateAverage(sce, BPPARAM = bp) hist(log10(ave), breaks = 100, main = "", col = "grey", xlab = expression(Log[10]~"average count"))
Remove genes that have too low average counts
rowData(sce)$AveCount <- ave to.keep <- ave > 0.001 sce <- sce[to.keep,] # Number excluded genes sum(!to.keep) dim(sce)
Library-size normalization:
clusters <- scran::quickCluster(sce, method = "igraph", min.mean = 0.1, BPPARAM = bp) sce <- scran::computeSumFactors(sce, min.mean = 0.1, cluster = clusters, BPPARAM = bp) plot(scater::librarySizeFactors(sce), sizeFactors(sce), pch = 16, xlab = "Library size factors", ylab = "Deconvolution factors", log = "xy") sce <- scater::logNormCounts(sce) sce
Modelling the mean-variance trend:
dec.pbmc <- scran::modelGeneVarByPoisson(sce, BPPARAM = bp) top.pbmc <- scran::getTopHVGs(dec.pbmc, prop = 0.1) plot(dec.pbmc$mean, dec.pbmc$total, pch=16, cex=0.5, xlab="Mean of log-expression", ylab="Variance of log-expression") curfit <- metadata(dec.pbmc) curve(curfit$trend(x), col = 'dodgerblue', add = TRUE, lwd = 2)
Denoise log-expression data by removing principal components corresponding to technical noise (might take a while):
sce <- scran::denoisePCA(sce, subset.row = top.pbmc, technical = dec.pbmc, BPPARAM = bp) ncol(reducedDim(sce, "PCA")) plot(attr(reducedDim(sce), "percentVar"), xlab = "PC", ylab = "Proportion of variance explained") abline(v = ncol(reducedDim(sce, "PCA")), lty = 2, col = "red")
tSNE:
sce <- scater::runTSNE(sce, dimred = "PCA", perplexity = 30, BPPARAM = bp) sce$sizeFactor <- sizeFactors(sce) scater::plotTSNE(sce, colour_by = "sizeFactor")
Clustering with graph-based methods:
snn.gr <- scran::buildSNNGraph(sce, use.dimred = "PCA", BPPARAM = bp, k = 25) clusters <- igraph::cluster_walktrap(snn.gr) sce$Cluster <- factor(clusters$membership) table(sce$Cluster)
cluster.mod <- scran::clusterModularity(snn.gr, sce$Cluster, get.weights = TRUE) log.ratio <- log2(cluster.mod$observed / cluster.mod$expected + 1) ComplexHeatmap::Heatmap(log.ratio, cluster_rows = FALSE, cluster_columns = FALSE) scater::plotTSNE(sce, colour_by = "Cluster")
Transfer cell type annotation from reference profiles in the Human Primary Cell Atlas and ENCODE:
hpc <- transferCellType(sce, "hpca") encode <- transferCellType(sce, "encode") sce$hpca.celltype <- hpc$labels sce$encode.celltype <- encode$labels maxScore <- function(res) vapply(1:nrow(res), function(i) res$scores[i, res[i,"labels"]], numeric(1)) sce$hpca.celltype.score <- maxScore(hpc) sce$encode.celltype.score <- maxScore(hpc)
Display main cell types present:
plotMainCellTypes(sce, "hpca.celltype") plotMainCellTypes(sce, "encode.celltype")
Display ambiguity of cell type assignments:
SingleR::plotScoreHeatmap(hpc) SingleR::plotScoreHeatmap(encode)
Adopted from Shih et al., PLoS One (2018):
epithelial <- c("EPCAM", "KRT8", "KRT18", "KRT19") lymphocyte <- c("PTPRC", "CD3E", "CD19", "MS4A1") endothelial <- c("PECAM1", "CD34") fibroblast <- c("ACTA2", "DCN", "ACTB") stromal <- c("THY1", "ENG", "VIM", "CD44") origin <- c("PAX8", "CALB2", "COL11A1") scater::plotTSNE(sce, colour_by = epithelial[1])
sce.entrez <- EnrichmentBrowser::idMap(sce, org = "hsa", from = "SYMBOL", to = "ENTREZID") am <- as.matrix(assay(sce.entrez, "logcounts")) cst <- consensusOV::get.consensus.subtypes(am, names(sce.entrez)) sts <- sub("_consensus$", "", as.vector(cst$consensusOV.subtypes)) sce$subtype <- sts sce$margin <- consensusOV::margin(cst$rf.probs) summary(sce$margin) scater::plotTSNE(sce, colour_by = "subtype") scater::plotTSNE(sce, colour_by = "margin")
Read the pre-processed 10X data for all tumors as list
of SingleCellExperiment
s.
tumors <- c(59, 76, 77, 89, 90) tags <- c("cu0pxp0sepsvhos", "taff3g6nlmf5yk4", "2otdxfunxnlr6ql", "sojupsrjxnvnf6d", "ydokzcxgkugo678") durl <- "https://dl.dropboxusercontent.com/s/tag/sampleXX_sce.rds"
readSCE <- function(i) { dl <- sub("XX", tumors[i], durl) dl <- sub("tag", tags[i], dl) readRDS(file(dl)) }
sces <- lapply(1:5, readSCE) names(sces) <- paste0("T", tumors) sces
Summarize cell type annotation:
sces <- lapply(sces, annoCellType)
vapply(sces, function(sce) table(sce$celltype), integer(5)) vapply(sces, function(sce) table(sce$subtype), integer(4))
Display cell type and subtype annotation for one tumor at a time:
p2a <- plotType(sces[[1]]) p2b <- plotType(sces[[1]], type = "subtype") p2 <- ggarrange(p2b + theme_bw() + theme(plot.margin = margin(r = 1)), p2a + theme_bw() + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank(), plot.margin = margin(l = 1, r=1)), legend = "top", align = "h", labels = c("A", "B")) p2
Facet cell type and subtype visualization for all tumors:
stp <- facetTumors(sces, col = "subtype", pal = stcols) pal <- ggpubr::get_palette("npg", 5) names(pal) <- CELL.TYPES ctp <- facetTumors(sces, col = "celltype", pal = pal) p2 <- ggpubr::ggarrange(stp + theme(axis.text.x = element_blank(), axis.title.x = element_blank()), ctp, nrow = 2, align = "v", labels = c("A", "B"), heights = c(1, 1.2))
Barplot percentage of subtype / cell type for all 10X tumors:
cttab <- vapply(sces, getPerc, numeric(5), perc=FALSE) total <- colSums(cttab) prop.test(cttab["EPI",], total) prop.test(cttab["STROM",], total)
stbp <- bpType(sces, "subtype", pal = stcols) ctbp <- bpType(sces, "celltype", pal = pal)
Cell type vs subtype matrix:
mat <- getCelltypeSubtypeMatrix(sces) ctstbp <- bpCrossType(mat, pal) ctstbp
Or as matrix plot:
mat <- round(mat / sum(mat) * 100) mat <- mat[,CELL.TYPES] matrixPlot(mat, cnames = c("SUBTYPE", "CELL.TYPE"), high.color = "darkgreen")
p1 <- ggarrange(stbp, ctbp + theme(axis.text.y = element_blank(), axis.title.y = element_blank()), # axis.ticks.x = element_blank(), # axis.line.x = element_blank()), ctstbp + theme(axis.text.y = element_blank(), axis.title.y = element_blank()), nrow = 1, hjust = c(-0.5, 0.5, 0.5), widths = c(1.3,1,1), legend = "none", align = "h", labels = c("C", "D", "E")) p1
Margin scores:
spl.margin <- function(sce, col = "celltype") reshape2::melt(split(sce$margin, sce[[col]])) psces <- lapply(sces, spl.margin) for(i in tumors) psces[[match(i, tumors)]]$tumor <- paste0("T", i) df <- do.call(rbind, psces) colnames(df) <- c("MARGIN", "CELL.TYPE", "TUMOR") df[[2]] <- factor(df[[2]], levels = CELL.TYPES)
Boxplot of margin scores:
ggboxplot(df, x = "CELL.TYPE", y = "MARGIN", width = 0.8, notch = TRUE, fill = "CELL.TYPE", palette = get_palette("lancet", 5), facet.by = "TUMOR", nrow = 1, x.text.angle = 45, legend="none", ggtheme = theme_bw(base_size = 12))
Only epithelial cells:
df <- subset(df, CELL.TYPE == "EPI") bp <- ggboxplot(df, x = "TUMOR", y = "MARGIN", width = 0.8, notch = TRUE, fill = "lightgrey", #cb.red, legend="none", ggtheme = theme_bw(), ylab = "MARGIN (EPI)", font.x = 11, font.y = 11, font.tickslab = c(10, "plain", "black")) # ylim = c(0,1),x.text.angle = 45, df.points <- data.frame(x=1:5, y=c(0.376, 0.766, 0.182, 0.646, 0.214)) bp <- bp + geom_point(data = df.points, aes(x = x, y = y), shape = 8, size = 3) + geom_text(data = df.points[2,], aes(x = x, y = y), label = "Bulk", hjust = 0, nudge_x = 0.1, size = 4) bp
durl <- "https://dl.dropboxusercontent.com/s/lit4wok327u1mps" dfile <- file.path(durl, "Bulk_RNAseq_March2020.txt") dat <- read.delim(file(dfile), as.is=TRUE) cpm.ind <- seq_len(which(dat[,1] == "")[1] - 1) cpm.dat <- dat[cpm.ind, 1:8] raw.dat <- dat[,9:16] cpm.sym <- cpm.dat[,1] raw.sym <- raw.dat[,1]
Use pre-computed library size-normalized expression values for subtype assignment:
rownames(cpm.dat) <- cpm.dat[,2] cpm.dat <- cpm.dat[,3:8] colnames(cpm.dat) <- paste0("T", c(59, 77, 76, 89, 90, 91)) cpm.dat <- as.matrix(cpm.dat) mode(cpm.dat) <- "numeric" cpm.se <- SummarizedExperiment(assays = list(cpm = cpm.dat)) cpm.se <- EnrichmentBrowser::idMap(cpm.se, org="hsa", from="ENSEMBL", to="ENTREZID") cpm.file <- file.path(data.dir, "bulk_cpm_subtypes.rds")
cst <- consensusOV::get.consensus.subtypes(assay(cpm.se), names(cpm.se)) saveRDS(cst, file = cpm.file)
cst <- readRDS(cpm.file) cst
Sanity check: start from raw read counts
rownames(raw.dat) <- raw.dat[,2] raw.dat <- raw.dat[,3:8] colnames(raw.dat) <- paste0("T", c(59, 77, 76, 89, 90, 91)) raw.dat <- as.matrix(raw.dat) mode(raw.dat) <- "numeric" raw.dat <- edgeR::cpm(raw.dat, log=TRUE) raw.se <- SummarizedExperiment(assays = list(raw = raw.dat)) raw.se <- EnrichmentBrowser::idMap(raw.se, org="hsa", from="ENSEMBL", to="ENTREZID") raw.file <- file.path(data.dir, "bulk_raw_subtypes.rds")
cst2 <- consensusOV::get.consensus.subtypes(assay(raw.se), names(raw.se)) saveRDS(cst2, file = raw.file)
Pretty much the same as when starting from already normalized expression values:
cst2 <- readRDS(raw.file) cst2
Contrast bulk and single-cell margins:
mat <- t(cst$rf.probs) rownames(mat) <- sub("_consensus", "", rownames(mat)) mat <- round(mat, digits = 2) mat <- mat[c(3,4,1,2),c(1,3,2,4)]
mat <- rbind(c(0.15, 0.02, 0.5, 0.13, 0.12), c(0.59, 0.03, 0.14, 0.77, 0.07), c(0.22, 0.85, 0.03, 0.08, 0.3), c(0.04, 0.09, 0.32, 0.02, 0.52)) colnames(mat) <- paste0("T", tumors) rownames(mat) <- SUBTYPES mat <- mat[4:1,] mp <- matrixPlot(t(mat), cnames = c("TUMORS", "CLASS.PROB"), bw.thresh = 0.5, gthresh = 0.2) mp <- mp + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.line.x = element_blank(), plot.margin = margin(b = 0)) mp <- mp + ylab("BULK") p0 <- ggarrange(mp, bp, vjust = 0.5, hjust = 0.5, nrow = 2, align = "v", heights = c(1,3), labels = c("F", "")) p0 p3 <- ggarrange(p1, p0, nrow = 1, widths = c(2,1)) ggarrange(p2, p3, heights = c(2,1), nrow = 2)
Prepare input files from a SingleCellExperiment
obj <- generateInferCNVInput(sces[[1]], "tumor59")
Run inferCNV
for one tumor at a time (computational intensive, several hours).
Best executed on a cluster (here with 20 cores).
# cutoff = 1 works well for Smart-seq2, # cutoff = 0.1 works well for 10x Genomics res <- infercnv::run(obj, cutoff = 0.1, out_dir = "tumor59", num_threads = 20, denoise = TRUE, HMM = TRUE)
for(i in seq_along(sces)) colnames(sces[[i]]) <- sces[[i]]$Barcode
Read observation matrix (expression profiles of cancer epithelial cells)
tags <- c("p7r313prargcp3g", "6yunw77zfe4nzgw", "qaop7h1sh2bxm5g", "x469lcpm3rp5dxo", "m56zm2zywjcongz") durl <- "https://dl.dropboxusercontent.com/s/tag/TXX_infercnv.observations.rds"
readObsMat <- function(i) { dl <- sub("XX", tumors[i], durl) dl <- sub("tag", tags[i], dl) readRDS(file(dl)) }
obs.mats <- lapply(1:5, readObsMat) names(obs.mats) <- paste0("T", tumors)
Subset and annotate cells:
for(i in seq_along(tumors)) obs.mats[[i]] <- subsetObservations(obs.mats[[i]], sces[[i]]) ra <- getCellAnnotation(sces, obs.mats)
Create union of genes featured in the inferCNV output of each tumor, extend individual observations as needed, and create a combined observation matrix storing the information across all tumors:
un <- Reduce(union, lapply(obs.mats, rownames)) obs.mats <- lapply(obs.mats, extendObservations, un = un) obs.mat <- do.call(cbind, lapply(obs.mats, function(o) o[un,]))
Obtain genomic coordinates for the genes in the combined observation matrix:
egenes <- symbols2ranges(rownames(obs.mat)) obs.mat <- obs.mat[names(egenes),] chr <- as.character(seqnames(egenes))
Highlight specific recurrent regions with known driver genes from the TCGA OVC paper:
clabs <- getRecurrentDriverGenes(rownames(obs.mat))
Plot the heatmap:
im <- ComplexHeatmap::Heatmap(t(obs.mat), #top_annotation = ta, right_annotation = ra, name = "Expression", col = circlize::colorRamp2(#c(0.895, 1.105), c("white", "black")), c(0.895, 1, 1.105), c("blue", "white", "red")), row_title_rot = 0, column_title_rot = 90, show_row_names = FALSE, show_column_names = TRUE, cluster_rows = FALSE, cluster_columns = FALSE, row_split = rep(paste0("T", tumors), sapply(obs.mats, ncol)), column_spl = factor(chr, levels = unique(chr)), column_gap = unit(0, "mm"), column_title_gp = gpar(fontsize = 6), column_labels = clabs, column_names_gp = gpar(fontsize = 6), border = TRUE) im <- grid.grabExpr(draw(im)) im
for(i in seq_along(sces)) colnames(sces[[i]]) <- NULL sce59 <- subsetByCellType(sces[[1]], cell.type = "EPI", subtype = c("DIF", "PRO")) sce59
cyclin.genes <- grep("^CCN[ABDE][0-9]$", names(sce59), value = TRUE) cyclin.genes markerHeatmap(sce59, cyclin.genes) bpCyclins(sce59)
sce77 <- subsetByCellType(sces[[3]], cell.type = "EPI", subtype = c("DIF", "PRO")) bpCyclins(sce77)
sce90 <- subsetByCellType(sces[[5]], cell.type = "EPI", subtype = c("DIF", "PRO")) bpCyclins(sce90)
markers59 <- scran::findMarkers(sce59, groups = sce59$subtype) head(markers59$DIF) markers59$DIF["CCND1",] head(markers59$DIF[markers59$DIF$logFC.PRO < 0,]) markerHeatmap(sce59, rownames(markers59$DIF)[1:25])
Contrast with bulk markers
data.dir <- system.file("extdata", package = "subtypeHeterogeneity") verhaak.file <- file.path(data.dir, "Verhaak_supplementT8A.txt") bulk.markers <- getExtendedVerhaakSignature(verhaak.file, nr.genes.per.subtype = 200, by.subtype = TRUE) bmarkers59 <- compileMarkers(markers59, bulk.markers) bmarkers59 markerHeatmap(sce59, rownames(bmarkers59), row.split = TRUE)
markers77 <- scran::findMarkers(sce77, groups = sce77$subtype) markerHeatmap(sce77, rownames(markers77$DIF)[1:25]) bmarkers77 <- compileMarkers(markers77, bulk.markers) markerHeatmap(sce77, rownames(bmarkers77), row.split = TRUE)
markers90 <- scran::findMarkers(sce90, groups = sce90$subtype) markerHeatmap(sce90, rownames(markers90$DIF)[1:25]) bmarkers90 <- compileMarkers(markers90, bulk.markers) markerHeatmap(sce90, rownames(bmarkers90), row.split = TRUE)
markers <- list(markers59, markers77, markers90) dif.markers <- getCommonMarkers(markers, n = 10) pro.markers <- getCommonMarkers(markers, subtype = "PRO", n = 10) markers <- c(dif.markers, pro.markers)
capture the heatmap
ml <- markerHeatmapList(sces[c(1,3,5)], markers, row.split = TRUE) m <- grid.grabExpr(draw(ml))
Run the cyclone classifier (pre-computed, computationally intensive)
sce.ensembl <- EnrichmentBrowser::idMap(sce, org = "hsa", from = "SYMBOL", to = "ENSEMBL") marker.file <- system.file("extdata", "human_cycle_markers.rds", package="scran") hm.pairs <- readRDS(marker.file) cy.assignments <- scran::cyclone(sce.ensembl, hm.pairs) cy.file <- file.path(data.dir, "T59_cyclone.rds") saveRDS(cy.assignments, file = cy.file)
Get the computed assignments and plot:
cy.file <- file.path(data.dir, "T59_cyclone.rds") cy.assignments <- readRDS(cy.file) plot(cy.assignments$score$G1, cy.assignments$score$G2M, xlab = "G1 score", ylab = "G2/M score", pch = 16) df <- data.frame(cy.assignments$score, subtype = sce59$subtype) ggscatter(df, x = "G1", y = "G2M", color = "subtype", palette = stcols[c("DIF", "PRO")], ggtheme = ggplot2::theme_bw())
tab <- table(cy.assignments$phases, sce59$subtype) tab chisq.test(tab)
Figure 5:
bc <- bpCyclins(sces[[1]], top.only = TRUE) + ylim(c(0,4.3)) bc <- bc + geom_signif(annotations = "1.8e-07", y_position = 4.25, xmin = 4.8, xmax = 5.2) gs <- ggscatter(df, x = "G1", y = "G2M", color = "subtype", palette = stcols[c("DIF", "PRO")], ggtheme = ggplot2::theme_bw(), legend = "none") bc <- ggarrange(bc, gs, ncol = 2, labels = c("B", "C")) bcd <- ggarrange(bc, m, nrow = 2, labels = c("", "D")) bcd
ggarrange(im, bcd, ncol = 2, labels = c("A",""))
cy.file <- sub("59", "77", cy.file) cy.assignments <- readRDS(cy.file) df <- data.frame(cy.assignments$score, subtype = sce77$subtype) ggscatter(df, x = "G1", y = "G2M", color = "subtype", palette = stcols[c("DIF", "PRO")], ggtheme = ggplot2::theme_bw())
tab <- table(cy.assignments$phases, sce77$subtype) tab chisq.test(tab)
cy.file <- sub("77", "90", cy.file) cy.assignments <- readRDS(cy.file) df <- data.frame(cy.assignments$score, subtype = sce90$subtype) ggscatter(df, x = "G1", y = "G2M", color = "subtype", palette = stcols[c("DIF", "PRO")], ggtheme = ggplot2::theme_bw())
tab <- table(cy.assignments$phases, sce90$subtype) tab fisher.test(tab)
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.