# rmarkdown::render("vignettes/vignette.Rmd")
devtools::load_all()
library(dropsim)
library(data.table)
library(Matrix)
library(ggplot2)
knitr::opts_chunk$set(fig.width=10)

Model Description

This is a package for simulating single cell RNAseq data. The target is a digital gene expression matrix counts matrix C (N cells by L genes). If cells, genes, and cell types are indexed by n, l, and t respectively then:

$$ C_{nl} = \mathit{Poisson}\big( \frac{e_{t(n)l}}{\sum_{l=1}^{L} e_{t(n)l}} \ s_n \big) $$

Where e is the true baseline expression and s is the library size, each of which are drawn from a lognormal distribution. A realistic differential expression profile between cell types can be simulated by multiplying the baseline expression values e by a log-logistic distribution.

Simple Example

library(dropsim)
set.seed(42)

new_parameters <- new("dropsim_parameters",
                      n_genes = 15000L,
                      n_cells = 1000L,
                      gene_meanlog = -12,
                      gene_sdlog = 2.5,
                      library_meanlog = 10,
                      library_sdlog = 0.4,
                      groups = data.table::data.table(
                        scale = c(0.06,0.06,0.06,0.04),
                        cells = list(1:200, 201:600, 601:620, sample.int(1000, size=100, replace = TRUE)),
                        names = c("A","B","C","2")
                      )
                      )

# Simulate counts
dge <- simulateDGE(new_parameters, seed = 42)$counts

str(as.matrix(dge))

Summary Plots

We can then calculate gene-wise summaries and visualise the simulated dataset

# Summarise Counts matrix
summarised_dge <- summariseDGE(dge, name="Simulation 1")

# Plot summary plots
plot_summaryDGE(summarised_dge)

PCA

We can also do a PCA analysis to see if the groups separate

# Normalise Counts matrix
normalised_dge <- normaliseDGE(dge)

# Do a PCA to check
dge_pca <- prcomp(normalised_dge)

# Plot PCA
qplot(dge_pca$x[,1], dge_pca$x[,2], colour=rownames(dge_pca$x)) + labs(colour="Group", y="PC2", x="PC1")

Comparisons

If we have multiple datasets we can do comparisons

summarised_dge_2 <- summariseDGE(simulateDGE(dropsim_parameters(), seed=42)$counts, name="Simulation 2")
dispersionDGE(rbind(summarised_dge, summarised_dge_2)) + facet_wrap(~Name)

Simulate based on dataset

If you have data you want to match the properties of you can create parameters from them

new_parameters <- fit_parameters(dge)
print(new_parameters)


marchinilab/dropsim documentation built on May 17, 2019, 1:34 p.m.