knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages({
    library(DropletUtils)
    library(scater)
    library(scran)
})

1. Reference

https://bioconductor.org/packages/release/workflows/vignettes/simpleSingleCell/inst/doc/umis.html

library(DropletUtils)
library(scater)
library(scran)

options(stringsAsFactors = FALSE)

2. Setting up the data

Reading in a sparse matrix
sample_num = "sample59"
home.dir = "/data/OVCscRNAseq"
dir.name = file.path(home.dir, "data/59")
list.files(dir.name)
sce = read10xCounts(dir.name)
sce
Annotating the rows
rownames(sce) <- uniquifyFeatureNames(rowData(sce)$ID, rowData(sce)$Symbol)
head(rownames(sce))
library(EnsDb.Hsapiens.v86)
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce)$ID, 
    column="SEQNAME", keytype="GENEID")
rowData(sce)$CHR <- location
summary(location=="MT")

3. Calling cells from emptry droplets

Testing for deviations from ambient expression
bcrank <- barcodeRanks(counts(sce))

# Only showing unique points for plotting speed.
uniq <- !duplicated(bcrank$rank)
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
    xlab="Rank", ylab="Total UMI count", cex.lab=1.2)

abline(h=bcrank$inflection, col="darkgreen", lty=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2)

legend("bottomleft", legend=c("Inflection", "Knee"), 
    col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2)

emptyDrops function wasn't applied, because the suspected 'empty drops' were already removed from the matrix when we received it from Tim.

set.seed(100)
e.out <- emptyDrops(counts(sce))
sum(e.out$FDR <= 0.001, na.rm=TRUE)
# using which() to automatically remove NAs.
sce <- sce[,which(e.out$FDR <= 0.001)]
full.data <- read10xCounts(dir.name, col.names=TRUE)
set.seed(100)
limit <- 100   
all.out <- emptyDrops(counts(full.data), lower=limit, test.ambient=TRUE)
hist(all.out$PValue[all.out$Total <= limit & all.out$Total > 0],
    xlab="P-value", main="", col="grey80") 

4. Quality control on the cells

sce <- calculateQCMetrics(sce, feature_controls=list(Mito=which(location=="MT")))
par(mfrow=c(1,3))

# hist(sce$log10_total_counts, breaks=20, col="grey80",
#     xlab="Log-total UMI count")
# hist(sce$log10_total_features_by_counts, breaks=20, col="grey80",
#     xlab="Log-total number of expressed features")
# hist(sce$pct_counts_Mito, breaks=20, col="grey80",
#     xlab="Proportion of reads in mitochondrial genes")

hist(sce$total_counts/1e3, xlab="Library sizes (thousands)", main="", 
    breaks=20, col="grey80", ylab="Number of cells")
hist(sce$total_features_by_counts, xlab="Number of expressed genes", main="", 
    breaks=20, col="grey80", ylab="Number of cells")
hist(sce$pct_counts_Mito, xlab="Mitochondrial proportion (%)", 
    ylab="Number of cells", breaks=20, main="", col="grey80")

We remove cells with log-library sizes that are more than 1 median absolute deviations (MADs) below the median log-library size. (A log-transformation improves resolution at small values, especially when the MAD of the raw values is comparable to or greater than the median.) We also remove cells where the log-transformed number of expressed genes is 1 MADs below the median. <-- Reference workflow used 3 MADs for libsize.drop and feature.drop, but our sample seemt to be more positvely skewed, so we tried 1 MADs for QC.

high.mito <- isOutlier(sce$pct_counts_Mito, nmads=3, type="higher")
libsize.drop <- isOutlier(sce$total_counts, nmads=1, type="lower", log=TRUE)
feature.drop <- isOutlier(sce$total_features_by_counts, nmads=1, type="lower", log=TRUE)
sce <- sce[,!(high.mito | libsize.drop | feature.drop)]
data.frame(ByHighMito=sum(high.mito),
           ByLibSize=sum(libsize.drop), 
           ByFeature=sum(feature.drop), 
           Remaining=ncol(sce))

5. Examining gene expression

ave <- calcAverage(sce)
hist(log10(ave), breaks=100, main="", col="grey",
    xlab=expression(Log[10]~"average count"))

remove genes that have average counts of zero, as this means that they are not expressed in any cell

rowData(sce)$AveCount <- ave
to.keep = ave > 0
sce = sce[to.keep,]
summary(to.keep)
# takes very long
plotHighestExprs(sce)

6. Normalizing for cell-specific biases

If method="igraph", a shared nearest neighbor graph is constructed using the buildSNNGraph function.

set.seed(1000)
clusters <- quickCluster(sce, method="igraph", min.mean=0.1,
    irlba.args=list(maxit=1000)) # for convergence.
table(clusters)

Normalization by deconvolution

sce <- computeSumFactors(sce, min.mean=0.1, cluster=clusters)
summary(sizeFactors(sce))
plot(sce$total_counts, sizeFactors(sce), log="xy")
sce <- scater::normalize(sce)

7. Modelling the mean-variance trend

new.trend <- makeTechTrend(x=sce)
fit <- trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))
plot(fit$mean, fit$var, pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE)
curve(new.trend(x), col="red", add=TRUE)
fit0 <- fit
fit$trend <- new.trend
dec <- decomposeVar(fit=fit)
top.dec <- dec[order(dec$bio, decreasing=TRUE),] 
head(top.dec)
plotExpression(sce, features=rownames(top.dec)[1:10])

8. Dimensionality reduction

Denoise log-expression data by removing principal components corresponding to technical noise.

set.seed(1000)
sce <- denoisePCA(sce, technical=new.trend, approximate=TRUE)
ncol(reducedDim(sce, "PCA"))
plot(attr(reducedDim(sce), "percentVar"), xlab="PC",
    ylab="Proportion of variance explained")
abline(v=ncol(reducedDim(sce, "PCA")), lty=2, col="red")
plotPCA(sce, ncomponents=3, colour_by="log10_total_features_by_counts")
set.seed(100)
sce <- runTSNE(sce, use_dimred="PCA", perplexity=30)
plotTSNE(sce, colour_by="log10_total_features_by_counts")

9. Clustering with graph-based methods

snn.gr <- buildSNNGraph(sce, use.dimred="PCA")   # make 20 clusters
clusters <- igraph::cluster_walktrap(snn.gr)
sce$Cluster <- factor(clusters$membership)
table(sce$Cluster)
k = 20
snn.gr <- buildSNNGraph(sce, use.dimred="PCA", k = k)   # make 6 clusters
clusters <- igraph::cluster_walktrap(snn.gr)
sce$Cluster <- factor(clusters$membership)
table(sce$Cluster)
cluster.mod <- clusterModularity(snn.gr, sce$Cluster, get.values=TRUE)
log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1)

library(pheatmap)
pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, 
    color=colorRampPalette(c("white", "blue"))(100))
plotTSNE(sce, colour_by="Cluster")

10. Marker gene detection

markers <- findMarkers(sce, clusters=sce$Cluster, direction="up")
marker.set <- markers[["8"]]
head(marker.set[,1:8], 10) # only first 8 columns, for brevity
chosen <- rownames(marker.set)[marker.set$Top <= 10]
plotHeatmap(sce, features=chosen, exprs_values="logcounts", 
    zlim=5, center=TRUE, symmetric=TRUE, cluster_cols=FALSE,
    # colour_columns_by="Cluster",
    columns=order(sce$Cluster),
    show_colnames=FALSE)

Adopted from Shih et al., PLoS One (2018)

epithelial = c('EPCAM', 'KRT8', 'KRT18', 'KRT19')
lymphocyte = c('PTPRC', 'CD3E', 'CD19', 'MS4A1')
endothelial = c('PECAM1', 'CD34')
fibroblast = c('ACTA2', 'DCN', 'ACTB')
stromal = c('THY1', 'ENG', 'VIM', 'CD44')

fontsize <- theme(axis.text=element_text(size=8), axis.title=element_text(size=10))

Epithelial marker expression

ep_1 <- plotTSNE(sce, colour_by=epithelial[1]) + fontsize
ep_2 <- plotTSNE(sce, colour_by=epithelial[2]) + fontsize
ep_3 <- plotTSNE(sce, colour_by=epithelial[3]) + fontsize
ep_4 <- plotTSNE(sce, colour_by=epithelial[4]) + fontsize
multiplot(ep_1, ep_3, ep_2, ep_4, cols=2)

Lympocyte marker expression

ly_1 <- plotTSNE(sce, colour_by=lymphocyte[1]) + fontsize
ly_2 <- plotTSNE(sce, colour_by=lymphocyte[2]) + fontsize
ly_3 <- plotTSNE(sce, colour_by=lymphocyte[3]) + fontsize
ly_4 <- plotTSNE(sce, colour_by=lymphocyte[4]) + fontsize
multiplot(ly_1, ly_3, ly_2, ly_4, cols=2)

Endothelial marker expression

en_1 <- plotTSNE(sce, colour_by=endothelial[1]) + fontsize
en_2 <- plotTSNE(sce, colour_by=endothelial[2]) + fontsize
multiplot(en_1, en_2, cols=2)

Fibroblast marker expression

fb_1 <- plotTSNE(sce, colour_by=fibroblast[1]) + fontsize
fb_2 <- plotTSNE(sce, colour_by=fibroblast[2]) + fontsize
fb_3 <- plotTSNE(sce, colour_by=fibroblast[3]) + fontsize
multiplot(fb_1, fb_2, fb_3, cols=2)

Stromal marker expression

st_1 <- plotTSNE(sce, colour_by=stromal[1]) + fontsize
st_2 <- plotTSNE(sce, colour_by=stromal[2]) + fontsize
st_3 <- plotTSNE(sce, colour_by=stromal[3]) + fontsize
st_4 <- plotTSNE(sce, colour_by=stromal[4]) + fontsize
multiplot(st_1, st_3, st_2, st_4, cols=2)

Ovarian cancer marker expression

epithelial_marker = c('WFDC2', 'CLDN4', 'FXYD3', 'CD24', 'ELF3', 'CLDN3', 'MUC1', 'SPINT2', 'KRT8', 'SLPI', 'KRT18', 'KRT19')

ov_1 <- plotTSNE(sce, colour_by=epithelial_marker[1]) + fontsize
ov_2 <- plotTSNE(sce, colour_by=epithelial_marker[2]) + fontsize
ov_3 <- plotTSNE(sce, colour_by=epithelial_marker[3]) + fontsize
ov_4 <- plotTSNE(sce, colour_by=epithelial_marker[4]) + fontsize
ov_5 <- plotTSNE(sce, colour_by=epithelial_marker[5]) + fontsize
ov_6 <- plotTSNE(sce, colour_by=epithelial_marker[6]) + fontsize
ov_7 <- plotTSNE(sce, colour_by=epithelial_marker[7]) + fontsize
ov_8 <- plotTSNE(sce, colour_by=epithelial_marker[8]) + fontsize
ov_9 <- plotTSNE(sce, colour_by=epithelial_marker[9]) + fontsize
ov_10 <- plotTSNE(sce, colour_by=epithelial_marker[10]) + fontsize
ov_11 <- plotTSNE(sce, colour_by=epithelial_marker[11]) + fontsize
ov_12 <- plotTSNE(sce, colour_by=epithelial_marker[12]) + fontsize
multiplot(ov_1, ov_5, ov_9, ov_2, ov_6, ov_10, ov_3, ov_7, ov_11, ov_4, ov_8, ov_12, cols=4)

Cell origin marker expression

cellOrigin = c("PAX8", "CALB2", "COL11A1")
ft = plotTSNE(sce, colour_by=cellOrigin[1]) + fontsize
ose_1 = plotTSNE(sce, colour_by=cellOrigin[2]) + fontsize
ose_2 = plotTSNE(sce, colour_by=cellOrigin[3]) + fontsize
multiplot(ft, ose_1, ose_2, cols=2)
sample_fname = paste0("scater/data/", sample_num,"_sce_", ncol(sce), ".rda")
saveRDS(sce, file = file.path(home.dir, sample_fname))


waldronlab/subtypeHeterogeneity documentation built on Oct. 28, 2020, 9:14 a.m.