knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
)
## Track time spent on making the vignette
startTime <- Sys.time()

## Bib setup
library("RefManageR")

## Write bibliography information
bib <- c(
    R = citation(),
    BiocStyle = citation("BiocStyle")[1],
    knitr = citation("knitr")[1],
    RefManageR = citation("RefManageR")[1],
    rmarkdown = citation("rmarkdown")[1],
    sessioninfo = citation("sessioninfo")[1],
    testthat = citation("testthat")[1],
    DeconvoBuddies = citation("DeconvoBuddies")[1]
)

Introduction

What is Deconvolution?

Inferring the composition of different cell types in a bulk RNA-seq data

Deconvolution is a analysis that aims to calculate the proportion of different cell types that make up a sample of bulk RNA-seq, based off of cell type gene expression profiles in a single cell/nuclei RNA-seq dataset.

![Use single cell data to infer the composition of bulk RNA-seq samples](http://research.libd.org/DeconvoBuddies/reference/figures/Deconvolution.png){width=50%}

Deconvolution Methods

| Approach | Method | Citation | Availability | | ---------- | -------- | ------- | ----------- |
| weighted least squares | DWLS | Tsoucas et al, Nature Comm, 2019 | R Package Cran | | Bias correction: Assay | Bisque | Jew et al, Nature Comm, 2020 | R Package github | | Bias correction: Sourse | MuSiC | Wang et al, Nature Communications, 2019 | R Package github | | Machine Learning | CIBERSORTx | Newman et al., Nature BioTech, 2019 | Webtool | | Bayesian | BayesPrism | Chu et al., Nature Cancer, 2022 | Webtool/R Package | | linear | Hspe | Hunt et al., Ann. Appl. Stat, 2021 | R package github |

Goals of this Vignette

We will be demonstrating how to use DeconvoBuddies tools when applying deconvolution with the Bisque package.

  1. Install and load required packages
  2. Download DLPFC RNA-seq data, and reference snRNA-seq data
  3. Find marker genes with DeconvoBuddies tools
  4. Run deconvolution with BisqueRNA
  5. Explore deconvolution output and create compostion plots with DeconvoBuddies tools
  6. Check proportion against RNAScope estimated proportions

Basics

1. Install and load required packages

R is an open-source statistical environment which can be easily modified to enhance its functionality via packages. r Biocpkg("DeconvoBuddies") is a R package available via the Bioconductor repository for packages. R can be installed on any operating system from CRAN after which you can install r Biocpkg("DeconvoBuddies") by using the following commands in your R session:

Install DeconvoBuddies

if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
  }

BiocManager::install("DeconvoBuddies")

## Check that you have a valid Bioconductor installation
BiocManager::valid()

Load Other Packages

## install Bisque from cran
# install.packages("BisqueRNA")

library("spatialLIBD")
library("DeconvoBuddies")
library("SummarizedExperiment")
library("SingleCellExperiment")
library("BisqueRNA")
library("dplyr")
library("tidyr")
library("tibble")
library("ggplot2")

2. Download DLPFC RNA-seq data, and reference snRNA-seq data.

Bulk RNA-seq data

Access the 110 sample Human DLPFC bulk RNA-seq dataset for LIBD. These samples
are from 19 tissue blocks, and 10 neurotypical adult donors. Samples were sequenced with two different library_types (polyA and RiboZeroGold), and three different RNA_extraction (Cyto, Total, Nuc), post quality control n=110 samples.

## use fetch deconvon data to load rse_gene
rse_gene <- fetch_deconvo_data("rse_gene")
rse_gene
# lobstr::obj_size(rse_gene)
# 41.16 MB

## Use Symbol as rownames

rownames(rse_gene) <- rowData(rse_gene)$Symbol

## bulk RNA seq samples were sequenced with different library types, and RNA extractions
table(rse_gene$library_type, rse_gene$library_prep)

Refernce snRNA-seq data

This data is paired with a single nucleus RNA-seq data set from spatialLIBD. This dataset can be accessed with spatialLIBD::fetch_data().

## Use spatialLIBD to fetch the snRNA-seq dataset
 sce_path_zip <- fetch_deconvo_data("sce")
 sce_path <- unzip(sce_path_zip, exdir = tempdir())
 sce <- HDF5Array::loadHDF5SummarizedExperiment(
     file.path(tempdir(), "sce_DLPFC_annotated"))

 # lobstr::obj_size(sce)
 # 172.28 MB

 ## exclude Ambiguous cell type
 sce <- sce[,sce$cellType_broad_hc != "Ambiguous"]
 sce$cellType_broad_hc <- droplevels(sce$cellType_broad_hc)

 dim(sce)
 ## Check the broad cell type distribution 
 table(sce$cellType_broad_hc)

 ## We're going to subset to the first 5k genes to save memory
 sce <- sce[seq_len(5000),]

Refernce snRNA-seq data

The cell type proportions from the RNAScope experiment used to evaluate the deconvolution estimations.

Key columns:

head(DeconvoBuddies::RNAScope_prop)

## plot the RNAScope compositions
plot_composition_bar(prop_long = RNAScope_prop,
                      sample_col = "SAMPLE_ID",
                      x_col = "SAMPLE_ID",
                     add_text = FALSE) +
  facet_wrap(~Combo, nrow = 2, scales = "free_x")

3. Find marker genes with DeconvoBuddies::get_mean_ratio

Mean Ratio marker finding

![Use Mean Ratio to find less noisy marker genes htan 1vALL](http://research.libd.org/DeconvoBuddies/reference/figures/get_mean_ratio.png)

marker_stats <- get_mean_ratio(sce, 
                               cellType_col = "cellType_broad_hc",
                               gene_ensembl = "gene_id",
                               gene_name = "gene_name")

marker_stats |> group_by(cellType.target) |> slice(1)

plot the top marker genes

plot_marker_express(sce, 
                    stats = marker_stats, 
                    cell_type = "Excit", 
                    cellType_col = "cellType_broad_hc",
                    rank_col = "MeanRatio.rank",
                    anno_col = "MeanRatio.anno",
                    gene_col = "gene"
                      )

Create a List of Marker Genes

marker_genes <- marker_stats |>
  filter(MeanRatio.rank <= 25 & gene %in% rownames(rse_gene))

marker_genes |> count(cellType.target)

marker_genes <- marker_genes |> pull(gene)

4. Prep Data and Run Bisque

prep data

convert to ExpressionSet format, filter for cells with no counts across marker genes

exp_set_bulk <- Biobase::ExpressionSet(
    assayData = assays(rse_gene[marker_genes, ])$counts,
    phenoData = AnnotatedDataFrame(
        as.data.frame(colData(rse_gene))[c("SAMPLE_ID")]
    )
)

exp_set_sce <- Biobase::ExpressionSet(
    assayData = as.matrix(assays(sce[marker_genes, ])$counts),
    phenoData = AnnotatedDataFrame(
        as.data.frame(colData(sce))[, c("cellType_broad_hc", "BrNum")]
    )
)

## check for nuclei with 0 marker expression
zero_cell_filter <- colSums(exprs(exp_set_sce)) != 0
message("Exclude ", sum(!zero_cell_filter), " cells")

exp_set_sce <- exp_set_sce[, zero_cell_filter]

Run Bisque

est_prop <- ReferenceBasedDecomposition(
    bulk.eset = exp_set_bulk,
    sc.eset = exp_set_sce,
    cell.types = "cellType_broad_hc",
    subject.names = "BrNum",
    use.overlap = FALSE
)

Explore Ouput

est_prop$bulk.props <- t(est_prop$bulk.props)

head(est_prop$bulk.props)

5. Explore deconvolution output and create compostion plots with DeconvoBuddies tools

Plot Composition Barplots of Bisque predicted compositions

## add Phenotype data to proportion estimates

pd <- colData(rse_gene) |>
    as.data.frame() |>
    select(SAMPLE_ID, Sample, library_combo)

## make proption estimates long so they are ggplot friendly
prop_long <- est_prop$bulk.props |>
    as.data.frame() |>
    tibble::rownames_to_column("SAMPLE_ID") |>
    tidyr::pivot_longer(!SAMPLE_ID, names_to = "cell_type", values_to = "prop") |>
    left_join(pd)

## create composition bar plots
## Average by sample
plot_composition_bar(prop_long = prop_long, sample_col = "SAMPLE_ID", x_col = "Sample", add_text = FALSE)

## for all library preparations by sample n=110
plot_composition_bar(prop_long = prop_long, sample_col = "SAMPLE_ID", x_col = "SAMPLE_ID", add_text = FALSE) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

6. Check proportion against RNAScope estimated proportions

Note to compare the deconvolution results to the RNAScope proportions, Oligo and OPC need to be added together.

prop_long_opc <- prop_long |>
  mutate(cell_type = gsub("Oligo|OPC", "OligoOPC", cell_type)) |>
  group_by(SAMPLE_ID, Sample, library_combo, cell_type) |>
  summarize(prop = sum(prop)) |>
  ungroup()

prop_long_opc |> count(cell_type)

## Join RNAScope and Bisque Proportions
prop_compare <- prop_long_opc |>
  inner_join(RNAScope_prop |>
              select(Sample, cell_type, prop_RNAScope = prop, prop_sn), 
            by = c("Sample", "cell_type"))

Calculate the correlation plot a scatter plot of the proportions

## correlation with RNAScope proportions
cor(prop_compare$prop, prop_compare$prop_RNAScope)

## Scatter plot with RNAScope proportions
prop_compare |>
  ggplot(aes(x = prop_RNAScope, y = prop, color = cell_type, shape = library_combo)) +
  geom_point() +
  geom_abline()


## correlation with snRNA-seq proportion
cor(prop_compare$prop, prop_compare$prop_sn)

## Scatter plot with RNAScope proportions
prop_compare |>
  ggplot(aes(x = prop_sn, y = prop, color = cell_type, shape = library_combo)) +
  geom_point() +
  geom_abline()

7. How to run deconvolution with hspe

hspe is downloadable as a tar.gz but can't be shown on this vignettes

Code to prep-input and run hspe pseudobulk the snRNA-seq data (not run here)

if(FALSE){
  sce_pb <- spatialLIBD::registration_pseudobulk(sce, 
                                               var_registration = "cellType_broad_hc",
                                               var_sample_id = "Sample")


## mixture samples is the gene expression from the bulk rse_gene
mixture_samples = t(assays(rse_gene)$logcounts)
mixture_samples[1:5, 1:5]

## pure samples is a vector of indexes of the differnt cell types
pure_samples = rafalib::splitit(sce_pb$cellType_broad_hc)

## refrence samples is the pseudobulked logcounts
reference_samples = t(assays(sce_pb)$logcounts)
reference_samples[1:5, 1:5]

ncol(mixture_samples) == ncol(reference_samples)

est_prop_hspe = hspe(Y = mixture_samples,
                       reference = reference_samples,
                       pure_samples = pure_samples,
                       markers = marker_genes,
                       seed = 10524)

}

Reproducibility

The r Biocpkg("DeconvoBuddies") package r Citep(bib[["DeconvoBuddies"]]) was made possible thanks to:

This package was developed using r BiocStyle::Biocpkg("biocthis").

Code for creating the vignette

## Create the vignette
library("rmarkdown")
system.time(render("Deconvolution_Benchmark_DLPFC.Rmd", "BiocStyle::html_document"))

## Extract the R code
library("knitr")
knit("Deconvolution_Benchmark_DLPFC.Rmd", tangle = TRUE)

Date the vignette was generated.

## Date the vignette was generated
Sys.time()

Wallclock time spent generating the vignette.

## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits = 3)

R session information.

## Session info
library("sessioninfo")
options(width = 120)
session_info()

Bibliography

This vignette was generated using r Biocpkg("BiocStyle") r Citep(bib[["BiocStyle"]]) with r CRANpkg("knitr") r Citep(bib[["knitr"]]) and r CRANpkg("rmarkdown") r Citep(bib[["rmarkdown"]]) running behind the scenes.

Citations made with r CRANpkg("RefManageR") r Citep(bib[["RefManageR"]]).

## Print bibliography
PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))


lahuuki/DeconvoBuddies documentation built on May 5, 2024, 9:35 a.m.