Below, we summarize the process of generating the data sets and clustering results provided in this package. More information is available from the GitHub repository corresponding to the publication (Duò et al., F1000Research 7:1141 (2018)): https://github.com/markrobinsonuzh/scRNAseq_clustering_comparison.
First, we download the raw data files, either from the conquer repository http://imlspenticton.uzh.ch:3838/conquer/ or from the 10x Genomics website https://support.10xgenomics.com/single-cell-gene-expression/datasets.
```{bash, eval=FALSE} wget -P data/data_raw http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE60749-GPL13112.rds wget -P data/data_raw http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE52529-GPL16791.rds wget -P data/data_raw http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/SRP073808.rds wget -O data/data_raw/SRP073808TCC.rds http://imlspenticton.uzh.ch/robinson_lab/conquer/data-tcc/SRP073808.rds wget -O data/data_raw/GSE52529-GPL16791TCC.rds http://imlspenticton.uzh.ch/robinson_lab/conquer/data-tcc/GSE52529-GPL16791.rds wget -O data/data_raw/GSE60749-GPL13112TCC.rds http://imlspenticton.uzh.ch/robinson_lab/conquer/data-tcc/GSE60749-GPL13112.rds
for D in b_cells naive_cytotoxic cd14_monocytes regulatory_t cd56_nk memory_t cd4_t_helper naive_t; do \ mkdir -p data/data_raw/zheng/${D}; \ wget -P data/data_raw/zheng http://cf.10xgenomics.com/samples/cell-exp/1.1.0/${D}/${D}_filtered_gene_bc_matrices.tar.gz; \ tar zxvf data/data_raw/zheng/${D}_filtered_gene_bc_matrices.tar.gz -C data/data_raw/zheng/${D}; \ rm -f data/data_raw/zheng/${D}_filtered_gene_bc_matrices.tar.gz; \ done
Next, we run each `import_QC_*.Rmd` file to generate the "full" data sets (retaining all genes with at least 1 read in at least one of the good-quality cells). From the "full" data sets, we then generate the filtered data sets using the following wrapper code: ```r suppressPackageStartupMessages({ library(SingleCellExperiment) library(scater) library(Seurat) library(scran) library(M3Drop) }) sce <- readRDS(scefile) sce_filt <- get(paste0("filter", method))(sce = sce, pctkeep = pctkeep) sce_filt <- calculateQCMetrics(sce_filt) sce_filt <- computeSumFactors(sce_filt, sizes = pmin(ncol(sce_filt), seq(20, 120, 20)), min.mean = 0.5) table(sizeFactors(sce_filt) < 0) sce_filt <- sce_filt[, sizeFactors(sce_filt) > 0] sce_filt <- normalise(sce_filt, exprs_values = "counts", return_log = TRUE, return_norm_as_exprs = TRUE) sce_filt <- normalise(sce_filt, exprs_values = "counts", return_log = FALSE, return_norm_as_exprs = FALSE) sce_filt <- runTSNE(sce_filt, exprs_values = "logcounts", perplexity = 10) saveRDS(sce_filt, file = outrds)
We use three filtering methods (indicated by method
in the code chunk above):
Expr
, HVG
and M3Drop
. The percentage of genes to keep (pctkeep
) is
always 10.
## Expr filtering filterExpr <- function(sce, pctkeep) { exprsn <- rowMeans(logcounts(sce)) keep <- order(exprsn, decreasing = TRUE)[seq_len(pctkeep/100*length(exprsn))] sce[keep, ] } ## HVG filtering filterHVG <- function(sce, pctkeep) { seurat <- CreateSeuratObject(raw.data = counts(sce), meta.data = as.data.frame(colData(sce)), min.cells = 0, min.genes = 0, project = "scRNAseq") seurat <- NormalizeData(seurat) seurat <- ScaleData(seurat, vars.to.regress = "nUMI", display.progress = FALSE) seurat <- FindVariableGenes(seurat, do.plot = FALSE) keep <- rownames(head(seurat@hvg.info, n = round(pctkeep/100*nrow(sce)))) sce[keep, ] } ## M3Drop filtering filterM3Drop <- function(sce, pctkeep) { cpm <- calculateCPM(sce) cpm <- cpm[rowMeans(cpm) > 0.05, ] res <- M3DropDifferentialExpression(expr_mat = cpm, mt_method = "none", mt_threshold = 1, suppress.plot = TRUE) res <- res[order(res$p.value), ] stopifnot(nrow(res) > round(pctkeep/100*nrow(sce))) keep <- rownames(head(res, n = round(pctkeep/100*nrow(sce)))) sce[keep, ] }
The clustering results were obtained by running a number of clustering
algorithms on each of the data sets generated above. The details can be found in
the GitHub repository corresponding to the publication (Duò et al.,
F1000Research 7:1141 (2018)):
https://github.com/markrobinsonuzh/scRNAseq_clustering_comparison.
The vignette run_clustering.Rmd
contains the wrapper script that we used to
apply each clustering function to each data set. It is also reproduced here. The
specific clustering functions (called apply_*()
) can be found in the GitHub
repository of the publication. The parameter settings are included in summarized
form in this package (duo_clustering_all_parameter_settings_v1()
).
print(scefile) ## Name of .rds file containing a SingleCellExperiment object print(method) ## Clustering method print(outrds) ## Name of .rds file where results will be written suppressPackageStartupMessages({ library(rjson) library(SingleCellExperiment) }) source(paste0("Rscripts/clustering/apply_", method, ".R")) ## Load parameter files. General dataset and method parameters as well as ## dataset/method-specific parameters params <- c(fromJSON(file = paste0("parameter_settings/", gsub("\\.rds$", "_", basename(scefile)), method, ".json")), fromJSON(file = paste0("parameter_settings/", method, ".json")), fromJSON(file = paste0("parameter_settings/", gsub("\\.rds$", ".json", basename(scefile)))) ) ## If any parameter is repeated, take the most specific if (any(duplicated(names(params)))) { warning("Possibly conflicting settings") params <- params[!duplicated(names(params))] } print(params) ## Set number of times to run clustering for each k n_rep <- 5 ## Read data sce <- readRDS(scefile) ## Run clustering set.seed(1234) L <- lapply(seq_len(n_rep), function(i) { ## For each run cat(paste0("run = ", i, "\n")) if (method == "Seurat") { tmp <- lapply(params$range_resolutions, function(resolution) { ## For each resolution cat(paste0("resolution = ", resolution, "\n")) ## Run clustering res <- get(paste0("apply_", method))(sce = sce, params = params, resolution = resolution) ## Put output in data frame df <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), method = method, cell = names(res$cluster), run = i, k = length(unique(res$cluster)), resolution = resolution, cluster = res$cluster, stringsAsFactors = FALSE, row.names = NULL) tm <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), method = method, run = i, k = length(unique(res$cluster)), resolution = resolution, user.self = res$st[["user.self"]], sys.self = res$st[["sys.self"]], user.child = res$st[["user.child"]], sys.child = res$st[["sys.child"]], elapsed = res$st[["elapsed"]], stringsAsFactors = FALSE, row.names = NULL) kest <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), method = method, run = i, k = length(unique(res$cluster)), resolution = resolution, est_k = res$est_k, stringsAsFactors = FALSE, row.names = NULL) list(clusters = df, timing = tm, kest = kest) }) ## End for each resolution } else { tmp <- lapply(params$range_clusters, function(k) { ## For each k cat(paste0("k = ", k, "\n")) ## Run clustering res <- get(paste0("apply_", method))(sce = sce, params = params, k = k) ## Put output in data frame df <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), method = method, cell = names(res$cluster), run = i, k = k, resolution = NA, cluster = res$cluster, stringsAsFactors = FALSE, row.names = NULL) tm <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), method = method, run = i, k = k, resolution = NA, user.self = res$st[["user.self"]], sys.self = res$st[["sys.self"]], user.child = res$st[["user.child"]], sys.child = res$st[["sys.child"]], elapsed = res$st[["elapsed"]], stringsAsFactors = FALSE, row.names = NULL) kest <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), method = method, run = i, k = k, resolution = NA, est_k = res$est_k, stringsAsFactors = FALSE, row.names = NULL) list(clusters = df, timing = tm, kest = kest) }) ## End for each k } ## Summarize across different values of k assignments <- do.call(rbind, lapply(tmp, function(w) w$clusters)) timings <- do.call(rbind, lapply(tmp, function(w) w$timing)) k_estimates <- do.call(rbind, lapply(tmp, function(w) w$kest)) list(assignments = assignments, timings = timings, k_estimates = k_estimates) }) ## End for each run ## Summarize across different runs assignments <- do.call(rbind, lapply(L, function(w) w$assignments)) timings <- do.call(rbind, lapply(L, function(w) w$timings)) k_estimates <- do.call(rbind, lapply(L, function(w) w$k_estimates)) ## Add true group for each cell truth <- data.frame(cell = as.character(rownames(colData(sce))), trueclass = as.character(colData(sce)$phenoid), stringsAsFactors = FALSE) assignments$trueclass <- truth$trueclass[match(assignments$cell, truth$cell)] ## Save results saveRDS(list(assignments = assignments, timings = timings, k_estimates = k_estimates), file = outrds)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.