knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") library(devtools) install_github("biqing-zhu/MARBLES") library(MARBLES)
The main function is
?run_MRF()
It requires three input values: (1) paraMRF: Starting value of the model parameter $\Phi$; (2) expr: A pseudo-bulk expression list for a gene. Each item is a vector of the pseudo-bulk expression across all individuals for a cell type in a condition. The list names should start with 'Cond1_' or 'Cond2_', and followed by the name of the cell type; (3) c_c: A binary cell type relationship network matrix. 1 means connected, and 0 means not connected; and (4) x_init: Initial DE status. Should be a binary vector of length n_k. 1 means DE, and 0 means EE. The usage of this function will be illustrated in the following real data application.
We obtained the mouse cortex snRNA-seq dataset from the R package r BiocStyle::Biocpkg("muscData")
[@crowell2020muscat]. The data has already been pre-processed as well as annotated, and contains four control and four lipopolysaccharide (LPS)-treated mice.
library(muscData) library(muscat) mouse <- Crowell19_4vs4() mouse <- prepSCE(mouse, kid = "cluster_id", # subpopulation assignments gid = "group_id", # group IDs (ctrl/stim) sid = "sample_id", # sample IDs (ctrl/stim.1234) drop = TRUE) # drop all other colData columns
Since we wanted to focus on neurons and glial cells, we only selected astrocytes, microglia, oligodendrocyte progenitor cells (OPC), oligodendrocytes, excitatory neurons, and inhibitory neurons for the downstream analyses, and only kept the first 1,000 genes for demonstration purposes. And we applied the function aggregateData
in the R package r BiocStyle::Biocpkg("muscat")
[@crowell2020muscat] to get cell-type-specific pseudobulk data.
pb <- aggregateData(mouse[1:1000, !mouse$cluster_id %in% c('Endothelial', "CPE cells")])
Then, we created a cell type relationship network based on domain knowledge to represent the similarity and cell lineage.
## Create connection matrix (OPC connected to neurons) celltypes <- c('Astrocytes', 'Excit. Neuron', 'Inhib. Neuron', 'Microglia', 'Oligodendrocytes', 'OPC') c_c <- matrix(1, nrow = length(celltypes), ncol = length(celltypes)) rownames(c_c) <- colnames(c_c) <- celltypes conn <- c(0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1) c_c[lower.tri(c_c)] <- conn c_c <- t(c_c) c_c[lower.tri(c_c)] <- conn c_c
r BiocStyle::Biocpkg("edgeR")
[@robinson2010edger] was first applied to the pseudo-bulk data to get the DE states using pbDS
res_edger <- pbDS(pb, method = "edgeR", verbose = FALSE) tbl_edger <- res_edger$table[[1]] names(tbl_edger) # view results for 1st cluster k1 <- tbl_edger[[1]] head(format(k1[, -ncol(k1)], digits = 2))
Next, genes with FDR corrected p value >= 0.05, and abs(logFC) <=1 were filtered out.
results_fil <- lapply(tbl_edger, function(u) { u <- dplyr::filter(u, p_adj.loc < 0.05 & abs(logFC) > 1) dplyr::arrange(u, p_adj.loc) }) results_gene <- lapply(results_fil, function(u) u$gene) pseudo_bulk_lst <- pb@assays@data@listData
In order to input the data into our model, it's structure was rearranged such that each element of the list represents a gene, and within each element, each item is a vector of the pseudo-bulk expression across all individuals for a cell type in a condition.
pseudo_bulk_lst_new <- list() for(gene in rownames(pseudo_bulk_lst[[1]])) { pseudo_bulk_lst_new[[gene]] <- list() for(celltype in celltypes) { pseudo_bulk_lst_new[[gene]][[paste0('Cond1_', celltype)]] <- unlist(pseudo_bulk_lst[[celltype]][gene, 1:4]) pseudo_bulk_lst_new[[gene]][[paste0('Cond2_', celltype)]] <- unlist(pseudo_bulk_lst[[celltype]][gene, 5:8]) } } str(pseudo_bulk_lst_new[[1]])
Finally, we ran our method with edgeR initialization.
x_mat_edger <- matrix(0, length(pseudo_bulk_lst_new), length(celltypes)) rownames(x_mat_edger) <- names(pseudo_bulk_lst_new) colnames(x_mat_edger) <- celltypes for (celltype in celltypes) { x_mat_edger[results_fil[[celltype]]$gene, celltype] <- 1 } x_mat_edger <- lapply(seq_len(nrow(x_mat_edger)), function(i) x_mat_edger[i,]) results_edger <- mapply(run_MRF, expr = pseudo_bulk_lst_new, x_init = x_mat_edger, MoreArgs = list(paraMRF = c(0, 2), c_c = c_c), SIMPLIFY = FALSE)
The $\Phi$ parameter for the 10th gene can be retrieved by
results_edger[[10]]$phi_mat[,ncol(results_edger[[10]]$phi_mat)]
And the DE states is
results_edger[[10]]$x_mat[,ncol(results_edger[[10]]$x_mat)]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.