Introduction.R
In CellBench: Construct Benchmarks for Single Cell Analysis Methods

## ---- include = FALSE---------------------------------------------------------
library(CellBench)
library(limma)
library(dplyr)
library(purrr)

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.retina = 1
)

## -----------------------------------------------------------------------------
library(CellBench)
library(limma)

datasets <- list(
    sample_10x = readRDS(cellbench_file("10x_sce_sample.rds"))
)

norm_method <- list(
    none = counts,
    cpm = function(x) t(t(1e6 * counts(x)) / colSums(counts(x)))
)

transform <- list(
    none = identity,
    log2 = function(x) log2(x+1)
)

## -----------------------------------------------------------------------------
res1 <- datasets %>%
    apply_methods(norm_method)

res1

## -----------------------------------------------------------------------------
res2 <- res1 %>%
    apply_methods(transform)

res2

## -----------------------------------------------------------------------------
# generate colour values from cell line information
cell_line <- factor(colData(datasets$sample_10x)$cell_line)
cell_line_col <- c("red", "blue", "green")[cell_line]

collapsed_res <- res2 %>%
    pipeline_collapse()

collapsed_res

## ---- fig.height = 9, fig.width = 8-------------------------------------------
par(mfrow = c(2, 2)) # declare 2x2 plotting grid

# loop through row of summarised pipeline table
for (i in 1:nrow(collapsed_res)) {
    title <- collapsed_res$pipeline[i]
    expr_mat <- collapsed_res$result[[i]] # note the use of [[]] due to list
    
    limma::plotMDS(
        expr_mat,
        main = title,
        pch = 19, # draw circles rather than label name
        col = cell_line_col
    )
}

par(mfrow = c(1, 1)) # undo plotting grid for future plots

## ---- eval = FALSE------------------------------------------------------------
#  # loading the individual sets of data
#  sc_data <- load_sc_data()
#  mrna_mix_data <- load_mrna_mix_data()
#  cell_mix_data <- load_cell_mix_data()
#  
#  # loading all datasets
#  all_data <- load_all_data()

## ---- eval = FALSE------------------------------------------------------------
#  # removes all locally cached CellBench datasets
#  clear_cached_datasets()

## ---- eval = FALSE------------------------------------------------------------
#  # the following two statements are equivalent
#  f(x)
#  x %>% f()
#  
#  # as are these
#  f(x, y)
#  x %>% f(y)
#  
#  # and these
#  h(g(f(x)))
#  x %>% f() %>% g() %>% h()
#  
#  # or these
#  h(g(f(x, a), b), c)
#  x %>% f(a) %>% g(b) %>% h(c)

## -----------------------------------------------------------------------------
x <- list(
    a = 1,
    b = 2,
    c = 3
)

lapply(x, sqrt)

## ---- result = 'hide'---------------------------------------------------------
sample_10x <- readRDS(cellbench_file("10x_sce_sample.rds"))

# even with a single dataset we need to construct a list
datasets <- list(
    sample_10x = sample_10x
)

# we can add more datasets to the pipeline by adding to the list
# here we have two datasets that are random samplings of the genes in the 10x
# sample data
datasets <- list(
    subsample1_10x = sample_genes(sample_10x, n = 1000),
    subsample2_10x = sample_genes(sample_10x, n = 1000)
)

# could have been any other kind of object as long as they are consistent
datasets <- list(
    set1 = matrix(rnorm(500, mean = 2, sd = 1), ncol = 5, nrow = 10),
    set2 = matrix(rnorm(500, mean = 2, sd = 1), ncol = 5, nrow = 10)
)

## ---- result = 'hide'---------------------------------------------------------
# counts is a function that can be run with counts(x) here it is named 
# "none" as it denotes the lack of normalisation
norm_method <- list(
    none = counts,
    cpm = function(x) t(t(1e6 * counts(x)) / colSums(counts(x)))
)

# "identity" is a useful function that simply returns its input 
# it allows the comparison between applying and not applying a method
transform <- list(
    none = identity,
    log2 = function(x) log2(x+1)
)

## ---- result = 'hide'---------------------------------------------------------
# using anonymous function wrappers
metric <- list(
    mean = function(x) { mean(x, na.rm = TRUE) },
    sd = function(x) { sd(x, na.rm = TRUE) }
)

# using purrr partial function
partial <- purrr::partial # explicit namespacing to avoid ambiguity
metric <- list(
    mean = partial(mean, na.rm = TRUE),
    sd = partial(sd, na.rm = TRUE)
)

# example use with kmeans
clustering <- list(
    kmeans_4 = partial(kmeans, centers = 4),
    kmeans_5 = partial(kmeans, centers = 5),
    kmeans_6 = partial(kmeans, centers = 6)
)

## -----------------------------------------------------------------------------
class(res2)

## -----------------------------------------------------------------------------
res2 %>% dplyr::filter(norm_method == "cpm")

## -----------------------------------------------------------------------------
# datasets
datasets <- list(
    sample_10x = readRDS(cellbench_file("10x_sce_sample.rds"))
)

# first set of methods in pipeline
norm_method <- list(
    none = counts,
    cpm = function(x) t(t(1e6 * counts(x)) / colSums(counts(x)))
)

# second set of methods in pipeline
transform <- list(
    none = identity,
    log2 = function(x) log2(x+1)
)

datasets %>%
    apply_methods(norm_method)

## -----------------------------------------------------------------------------
datasets %>%
    apply_methods(norm_method) %>%
    apply_methods(transform)

## ---- eval = FALSE------------------------------------------------------------
#  # set cellbench to use 4 threads
#  set_cellbench_threads(4)

## ---- eval = FALSE------------------------------------------------------------
#  set_cellbench_cache_path(".CellBenchCache")
#  methods <- list(
#      method1 = cache_method(method1),
#      method2 = cache_method(method2)
#  )

## ---- eval = FALSE------------------------------------------------------------
#  # clears the cache set by set_cellbench_cache_path() in the same session
#  clear_cellbench_cache()

## -----------------------------------------------------------------------------
# f is a function of three parameters
f <- function(x, y, z) {
    x + y + z
}

# f_list is a list of functions with two of the parameters pre-filled
f_list <- fn_arg_seq(f, y = 1:2, z = 3:4)

f_list

## -----------------------------------------------------------------------------
names(f_list)[1]
g <- f_list[[1]]
g(10)

names(f_list)[2]
h <- f_list[[2]]
h(20)