Intro.R
In cellGeometry: Geometric Single Cell Deconvolution

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE
)

## ----eval=FALSE---------------------------------------------------------------
# # Bioconductor must be installed +/- updated first
# BiocManager::install(version = "3.xx")  # set to latest version
# 
# # minimum necessary Bioconductor packages to install cellGeometry package
# BiocManager::install(c("ensembldb", "DelayedArray"))
# 
# # packages needed to read h5ad files
# BiocManager::install(c("zellkonverter", "rhdf5", "HDF5Array"))
# 
# # optional, if you are using Seurat
# install.packages("Seurat")
# 
# # package needed to convert ensembl gene ids to symbols
# BiocManager::install("AnnotationHub")

## ----eval=FALSE---------------------------------------------------------------
# install.packages("cellGeometry")

## ----eval=FALSE---------------------------------------------------------------
# devtools::install_github("myles-lewis/cellGeometry")

## ----out.width='100%', echo=FALSE---------------------------------------------
knitr::include_graphics("workflow.png")

## ----eval=FALSE---------------------------------------------------------------
# library(zellkonverter)
# library(SingleCellExperiment)
# library(cellGeometry)
# 
# typist_h5 <- readH5AD("2ac906a5-9725-4258-8e36-21a9f6c0302a.h5ad",
#                       use_hdf5 = TRUE, reader = "R")

## ----eval=FALSE---------------------------------------------------------------
# mat <- typist_h5@assays@data$X
# rownames(mat) <- rownames(typist_h5)
# meta <- typist_h5@colData@listData

## ----eval=FALSE---------------------------------------------------------------
# library(Seurat)
# typist <- readRDS("2ac906a5-9725-4258-8e36-21a9f6c0302a.rds")  # 15.5 GB in memory
# 
# mat <- typist@assays$RNA$counts
# meta <- typist@meta.data

## ----eval=FALSE---------------------------------------------------------------
# table(meta$Majority_voting_CellTypist)
# 
# subcl <- meta$Majority_voting_CellTypist
# cellgrp <- meta$Majority_voting_CellTypist_high
# 
# # reduce dataset to only blood (optional)
# subcl[meta$tissue != "blood"] <- NA
# cellgrp[meta$tissue != "blood"] <- NA

## ----eval=FALSE---------------------------------------------------------------
# mk <- cellMarkers(mat, subclass = subcl, cellgroup = cellgrp,
#                   dual_mean = TRUE, cores = 2)

## ----eval=FALSE---------------------------------------------------------------
# # example code using future for parallelisation on windows
# library(future)
# plan(multisession, workers = 4)
# 
# mk <- cellMarkers(mat, subclass = subcl, cellgroup = cellgrp,
#                   use_future = TRUE)

## ----eval=FALSE---------------------------------------------------------------
# library(AnnotationHub)
# ah <- AnnotationHub()
# ensDb_v110 <- ah[["AH113665"]]
# mk <- gene2symbol(mk, ensDb_v110)

## ----eval=FALSE---------------------------------------------------------------
# signature_heatmap(mk)  # visualise whole signature
# signature_heatmap(mk, top = 5)  # show top 5 genes for each subclass

## ----out.width='90%', echo=FALSE----------------------------------------------
knitr::include_graphics("typist_sig.png")

## ----eval=FALSE---------------------------------------------------------------
# spillover_heatmap(mk)

## ----out.width='80%', echo=FALSE----------------------------------------------
knitr::include_graphics("typist_spillover.png")

## ----eval=FALSE---------------------------------------------------------------
# cs <- cos_similarity(mk)
# ComplexHeatmap::Heatmap(cs)

## ----out.width='80%', echo=FALSE----------------------------------------------
knitr::include_graphics("typist_cos_sim.png")

## ----eval=FALSE---------------------------------------------------------------
# rank_angle(cs)

## ----eval=FALSE---------------------------------------------------------------
# diagnose(mk)

## ----eval=FALSE---------------------------------------------------------------
# mk <- updateMarkers(mk,
#                     remove_subclass = c("Helper T cells", "Cytotoxic T cells"))

## ----eval=FALSE---------------------------------------------------------------
# # simulated bulk
# set.seed(3)
# sim_counts <- generate_samples(mk, 25)
# sim_percent <- sim_counts / rowSums(sim_counts) * 100

## ----eval=FALSE---------------------------------------------------------------
# # sample from original sc count matrix
# # 1.43 mins (Intel); 45 secs (ARM)
# set.seed(99)
# times <- 1  # can be increased
# sim_sampled <- simulate_bulk(mat, sim_counts, subcl, times = times)
# 
# # fix rownames
# rownames(sim_sampled) <- gene2symbol(rownames(sim_sampled), ensDb_v110)

## ----eval=FALSE---------------------------------------------------------------
# # near optimal deconvolution of counts sampled from the original scRNA-Seq
# fit <- deconvolute(mk, sim_sampled,
#                    use_filter = FALSE, arith_mean = TRUE)
# 
# # plot results - multiple scatter plots
# plot_set(sim_counts, fit$subclass$output / times)  # adjust for oversampling using `times`
# plot_set(sim_percent, fit$subclass$percent)
# 
# # alternative plot - single scatter plot
# plot_pred(sim_counts, fit$subclass$output / times)
# 
# # stats table
# metric_set(sim_percent, fit$subclass$percent)

## ----out.width='90%', echo=FALSE----------------------------------------------
knitr::include_graphics("typist_sim.png")

## ----eval=FALSE---------------------------------------------------------------
# mk <- updateMarkers(mk, bulkdata = my_bulk_matrix)

## ----eval=FALSE---------------------------------------------------------------
# res <- tune_deconv(mk, sim_sampled, sim_counts * times,
#                    grid = list(nsubclass = c(5, 10, 15, 25, 50, 100, 200, 500, 1000),
#                                expfilter = c(0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.5),
#                                weight_method = c("none", "equal")),
#                    arith_mean = TRUE,
#                    use_filter = FALSE)
# 
# ## Tuning parameters: nsubclass, expfilter, weight_method
# ##   |==========================================================| 100%, Elapsed 00:07
# ## Best tune:
# ##   nsubclass  expfilter  weight_method  mean.RMSE
# ##        1000       0.25          equal      35.21

## ----eval=FALSE---------------------------------------------------------------
# plot_tune(res, xvar = "nsubclass", group = "weight_method")
# plot_tune(res, xvar = "nsubclass", group = "expfilter")
# plot_tune(res, xvar = "expfilter", group = "weight_method")
# plot_tune(res, xvar = "expfilter", group = "nsubclass")

## ----out.width='90%', echo=FALSE----------------------------------------------
knitr::include_graphics("typist_tune.png")

## ----eval=FALSE---------------------------------------------------------------
# # simple gaussian noise applied to counts
# sim_noise <- add_noise(sim_sampled)
# 
# # noise applied to log2 counts
# sim_noise <- log_noise(sim_sampled)
# 
# # noise applied to sqrt transformed counts
# sim_noise <- sqrt_noise(sim_sampled)
# 
# # whole genes are scaled up/down by a random amount
# # this simulates differences in chemistry
# sim_noise <- shift_noise(sim_sampled)

## ----eval=FALSE---------------------------------------------------------------
# res2 <- tune_deconv(mk, sim_noise, sim_counts,
#                    grid = list(nsubclass = c(5, 10, 15, 25, 50, 100, 200, 500, 1000),
#                                expfilter = c(0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.5),
#                                weight_method = c("none", "equal")),
#                    arith_mean = TRUE,
#                    use_filter = FALSE)

## ----eval=FALSE---------------------------------------------------------------
# # default method using var.e (see paper for details)
# se1 <- se(fit)
# head(se1)
# 
# # ordinary least squares
# se2 <- se(fit, type = "OLS")

## ----eval=FALSE---------------------------------------------------------------
# # default method using var.e (see paper for details)
# ci1 <- confint(fit)
# str(ci1)
# 
# # ordinary least squares
# ci2 <- confint(fit, type = "OLS")

## ----eval=FALSE---------------------------------------------------------------
# r <- residuals(fit)
# 
# # full table including all genes
# r1 <- residuals(fit, test = sim_sampled)

## ----eval=FALSE---------------------------------------------------------------
# # fitted values for bulk on x axis
# plot_residuals(fit)
# 
# # actual bulk gene expression - good for spotting outliers with massive expression
# plot_residuals(fit, sim_sampled)
# plot_residuals(fit, sim_sampled, type = "raw")
# plot_residuals(fit, sim_sampled, type = "student")

## ----eval=FALSE---------------------------------------------------------------
# rs1 <- rstandard(fit)
# rs2 <- rstudent(fit)
# cooksd <- cooks.distance(fit)

## ----eval=FALSE---------------------------------------------------------------
# sim_noise <- shift_noise(sim_sampled)
# fit2 <-  deconvolute(mk, sim_noise, npass = 4,
#                      use_filter = FALSE, arith_mean = TRUE)

## ----eval=FALSE---------------------------------------------------------------
# fit2 <-  deconvolute(mk, sim_noise,
#                      use_filter = FALSE, arith_mean = TRUE, comp_check = TRUE)
# 
# # show minimum deconvolution values across all samples to see where comp cut-offs occur
# plot_comp(fit2)
# 
# # show coefficient paths for a single subclass (across all samples)
# plot_path(fit2)
# 
# # show coef paths as cell percentages
# plot_path(fit2, type = "percent")
# 
# # show coef paths for the first bulk sample (across all subclasses)
# plot_path(fit2, sample = 1)
# plot_path(fit2, sample = 1, type = "percent")