Below, we summarize the process of generating the data sets and clustering results provided in this package. More information is available from the GitHub repository corresponding to the publication (Duò et al., F1000Research 7:1141 (2018)): https://github.com/markrobinsonuzh/scRNAseq_clustering_comparison.

# Generating the data sets

First, we download the raw data files, either from the conquer repository http://imlspenticton.uzh.ch:3838/conquer/ or from the 10x Genomics website https://support.10xgenomics.com/single-cell-gene-expression/datasets.

```{bash, eval=FALSE}
wget -P data/data_raw http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE60749-GPL13112.rds
wget -P data/data_raw http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE52529-GPL16791.rds
wget -P data/data_raw http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/SRP073808.rds
wget -O data/data_raw/SRP073808TCC.rds http://imlspenticton.uzh.ch/robinson_lab/conquer/data-tcc/SRP073808.rds
wget -O data/data_raw/GSE52529-GPL16791TCC.rds http://imlspenticton.uzh.ch/robinson_lab/conquer/data-tcc/GSE52529-GPL16791.rds
wget -O data/data_raw/GSE60749-GPL13112TCC.rds http://imlspenticton.uzh.ch/robinson_lab/conquer/data-tcc/GSE60749-GPL13112.rds

for D in b_cells naive_cytotoxic cd14_monocytes regulatory_t cd56_nk memory_t cd4_t_helper naive_t; do \
  mkdir -p data/data_raw/zheng/${D}; \
  wget -P data/data_raw/zheng http://cf.10xgenomics.com/samples/cell-exp/1.1.0/${D}/${D}_filtered_gene_bc_matrices.tar.gz; \
  tar zxvf data/data_raw/zheng/${D}_filtered_gene_bc_matrices.tar.gz -C data/data_raw/zheng/${D}; \
  rm -f data/data_raw/zheng/${D}_filtered_gene_bc_matrices.tar.gz; \
done
```

Next, we run each `import_QC_*.Rmd` file to generate the "full" data sets
(retaining all genes with at least 1 read in at least one of the good-quality
cells; see the sketch below). From the "full" data sets, we then generate the
filtered data sets with the wrapper code reproduced after the sketch.
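For reference, the gene-retention criterion for the "full" data sets amounts to keeping all genes detected in at least one retained cell. A minimal sketch, where `sce` denotes a hypothetical SingleCellExperiment already restricted to the good-quality cells (the actual code is in the `import_QC_*.Rmd` files):

```r
## Keep all genes with at least 1 read in at least one good-quality cell
sce_full <- sce[rowSums(counts(sce) > 0) > 0, ]
```

The filtering wrapper code follows: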

```r
suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(scater)
  library(Seurat)
  library(scran)
  library(M3Drop)
})

sce <- readRDS(scefile)

sce_filt <- get(paste0("filter", method))(sce = sce, pctkeep = pctkeep)

sce_filt <- calculateQCMetrics(sce_filt)
sce_filt <- computeSumFactors(sce_filt, sizes = pmin(ncol(sce_filt), 
                                                     seq(20, 120, 20)), 
                              min.mean = 0.5)
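## Check for negative size factors; only cells with positive size factors are kept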
table(sizeFactors(sce_filt) < 0)
sce_filt <- sce_filt[, sizeFactors(sce_filt) > 0]
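## Calculate log-transformed and untransformed normalized expression values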
sce_filt <- normalise(sce_filt, exprs_values = "counts", return_log = TRUE, 
                      return_norm_as_exprs = TRUE)
sce_filt <- normalise(sce_filt, exprs_values = "counts", return_log = FALSE, 
                      return_norm_as_exprs = FALSE)
sce_filt <- runTSNE(sce_filt, exprs_values = "logcounts", perplexity = 10)

saveRDS(sce_filt, file = outrds)
```

We use three filtering methods (indicated by `method` in the code chunk above): `Expr`, `HVG` and `M3Drop`. The percentage of genes to keep (`pctkeep`) is always 10.

```r
## Expr filtering
filterExpr <- function(sce, pctkeep) {
  exprsn <- rowMeans(logcounts(sce))
  keep <- order(exprsn, decreasing = TRUE)[seq_len(pctkeep/100*length(exprsn))]
  sce[keep, ]
}

## HVG filtering
filterHVG <- function(sce, pctkeep) {
  seurat <- CreateSeuratObject(raw.data = counts(sce), 
                               meta.data = as.data.frame(colData(sce)),
                               min.cells = 0, min.genes = 0, 
                               project = "scRNAseq")
  seurat <- NormalizeData(seurat)
  seurat <- ScaleData(seurat, vars.to.regress = "nUMI", 
                      display.progress = FALSE)
  seurat <- FindVariableGenes(seurat, do.plot = FALSE)
  keep <- rownames(head(seurat@hvg.info, n = round(pctkeep/100*nrow(sce))))
  sce[keep, ]
}

## M3Drop filtering
filterM3Drop <- function(sce, pctkeep) {
  cpm <- calculateCPM(sce)
  cpm <- cpm[rowMeans(cpm) > 0.05, ]
  res <- M3DropDifferentialExpression(expr_mat = cpm, 
                                      mt_method = "none", mt_threshold = 1, 
                                      suppress.plot = TRUE)
  res <- res[order(res$p.value), ]
  stopifnot(nrow(res) > round(pctkeep/100*nrow(sce)))
  keep <- rownames(head(res, n = round(pctkeep/100*nrow(sce))))
  sce[keep, ]
}
```
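As a hypothetical usage illustration (the wrapper above dispatches to these functions via `get(paste0("filter", method))`), applying the expression-based filter could look as follows:

```r
## Minimal sketch, assuming `sce` is a SingleCellExperiment with "counts" and
## "logcounts" assays, as prepared by the import_QC_*.Rmd files
sce_expr <- filterExpr(sce, pctkeep = 10)  ## keep the 10% highest-expressed genes
dim(sce_expr)
```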

# Running the clustering algorithms

The clustering results were obtained by running a number of clustering algorithms on each of the data sets generated above. The details can be found in the GitHub repository corresponding to the publication (Duò et al., F1000Research 7:1141 (2018)): https://github.com/markrobinsonuzh/scRNAseq_clustering_comparison. The vignette `run_clustering.Rmd` contains the wrapper script that we used to apply each clustering function to each data set; it is also reproduced here. The specific clustering functions (called `apply_*()`) can be found in the GitHub repository of the publication. The parameter settings are included, in summarized form, in this package (`duo_clustering_all_parameter_settings_v1()`).
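For orientation: the wrapper below only assumes that each `apply_*()` function returns a list with a named vector of cluster assignments (`cluster`), a timing object from `system.time()` (`st`), and an estimated number of clusters (`est_k`, `NA` where the method does not estimate one). A minimal, hypothetical skeleton (not one of the actual functions from the GitHub repository) could look like this:

```r
## Hypothetical skeleton of an apply_*() function, illustrating the return
## value expected by the wrapper below.
apply_example <- function(sce, params, k) {
  st <- system.time({
    ## ... run the actual clustering of the cells in `sce` here ...
    cluster <- setNames(rep(seq_len(k), length.out = ncol(sce)), colnames(sce))
  })
  list(cluster = cluster,  ## named vector of cluster labels, one per cell
       st = st,            ## proc_time object from system.time()
       est_k = NA)         ## estimated number of clusters, where applicable
}
```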

```r
print(scefile)  ## Name of .rds file containing a SingleCellExperiment object
print(method)  ## Clustering method
print(outrds)  ## Name of .rds file where results will be written

suppressPackageStartupMessages({
  library(rjson)
  library(SingleCellExperiment)
})
source(paste0("Rscripts/clustering/apply_", method, ".R"))

## Load parameter files. General dataset and method parameters as well as
## dataset/method-specific parameters
params <- c(fromJSON(file = paste0("parameter_settings/", 
                                   gsub("\\.rds$", "_", basename(scefile)), 
                                   method, ".json")),
            fromJSON(file = paste0("parameter_settings/", method, ".json")), 
            fromJSON(file = paste0("parameter_settings/", 
                                   gsub("\\.rds$", ".json", basename(scefile))))
            )
## If any parameter is repeated, take the most specific
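## (the most specific file is listed first above, so the first occurrence of a
## duplicated name, i.e. the most specific value, is the one that is kept)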
if (any(duplicated(names(params)))) {
  warning("Possibly conflicting settings")
  params <- params[!duplicated(names(params))]
}
print(params)

## Set number of times to run clustering for each k
n_rep <- 5

## Read data
sce <- readRDS(scefile)

## Run clustering
set.seed(1234)
L <- lapply(seq_len(n_rep), function(i) {  ## For each run
  cat(paste0("run = ", i, "\n"))
  if (method == "Seurat") {
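    ## Seurat is parametrized by a resolution rather than by the number of
    ## clusters; we therefore loop over a range of resolutions and record the
    ## resulting number of clusters (k) for each.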
    tmp <- lapply(params$range_resolutions, function(resolution) {  ## For each resolution
      cat(paste0("resolution = ", resolution, "\n"))
      ## Run clustering
      res <- get(paste0("apply_", method))(sce = sce, params = params, 
                                           resolution = resolution)

      ## Put output in data frame
      df <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), 
                       method = method, 
                       cell = names(res$cluster),
                       run = i,
                       k = length(unique(res$cluster)),
                       resolution = resolution,
                       cluster = res$cluster,
                       stringsAsFactors = FALSE, row.names = NULL)
      tm <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), 
                       method = method,
                       run = i, 
                       k = length(unique(res$cluster)),
                       resolution = resolution,
                       user.self = res$st[["user.self"]],
                       sys.self = res$st[["sys.self"]],
                       user.child = res$st[["user.child"]],
                       sys.child = res$st[["sys.child"]],
                       elapsed = res$st[["elapsed"]],
                       stringsAsFactors = FALSE, row.names = NULL)
      kest <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), 
                         method = method,
                         run = i, 
                         k = length(unique(res$cluster)),
                         resolution = resolution,
                         est_k = res$est_k,
                         stringsAsFactors = FALSE, row.names = NULL)
      list(clusters = df, timing = tm, kest = kest)
    })  ## End for each resolution
  } else {
    tmp <- lapply(params$range_clusters, function(k) {  ## For each k
      cat(paste0("k = ", k, "\n"))
      ## Run clustering
      res <- get(paste0("apply_", method))(sce = sce, params = params, k = k)

      ## Put output in data frame
      df <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), 
                       method = method, 
                       cell = names(res$cluster),
                       run = i,
                       k = k,
                       resolution = NA,
                       cluster = res$cluster,
                       stringsAsFactors = FALSE, row.names = NULL)
      tm <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), 
                       method = method,
                       run = i, 
                       k = k,
                       resolution = NA,
                       user.self = res$st[["user.self"]],
                       sys.self = res$st[["sys.self"]],
                       user.child = res$st[["user.child"]],
                       sys.child = res$st[["sys.child"]],
                       elapsed = res$st[["elapsed"]],
                       stringsAsFactors = FALSE, row.names = NULL)
      kest <- data.frame(dataset = gsub("\\.rds$", "", basename(scefile)), 
                         method = method,
                         run = i, 
                         k = k,
                         resolution = NA,
                         est_k = res$est_k,
                         stringsAsFactors = FALSE, row.names = NULL)
      list(clusters = df, timing = tm, kest = kest)
    })  ## End for each k
  }

  ## Summarize across different values of k
  assignments <- do.call(rbind, lapply(tmp, function(w) w$clusters))
  timings <- do.call(rbind, lapply(tmp, function(w) w$timing))
  k_estimates <- do.call(rbind, lapply(tmp, function(w) w$kest))
  list(assignments = assignments, timings = timings, k_estimates = k_estimates)
})  ## End for each run

## Summarize across different runs
assignments <- do.call(rbind, lapply(L, function(w) w$assignments))
timings <- do.call(rbind, lapply(L, function(w) w$timings))
k_estimates <- do.call(rbind, lapply(L, function(w) w$k_estimates))

## Add true group for each cell
truth <- data.frame(cell = as.character(rownames(colData(sce))),
                    trueclass = as.character(colData(sce)$phenoid),
                    stringsAsFactors = FALSE)
assignments$trueclass <- truth$trueclass[match(assignments$cell, truth$cell)]

## Save results
saveRDS(list(assignments = assignments, timings = timings,
             k_estimates = k_estimates), file = outrds)
```
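To get a feeling for the output of a single wrapper run, the saved list can be read back in. A minimal sketch, assuming `outrds` points to a results file written by the code above:

```r
res <- readRDS(outrds)
head(res$assignments)   ## cluster assignments per cell, run and k (or resolution)
head(res$timings)       ## run times per run and k
head(res$k_estimates)   ## estimated number of clusters, where available
```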

