ddSingleCell: Multi-sample multi-group scRNA-seq data analysis tools

Documented in simData

#' simData
#' 
#' Simulation of complex scRNA-seq data 
#' 
#' \code{simData} simulates multiple clusters and samples 
#' across 2 experimental conditions from a real scRNA-seq data set.
#' 
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
#' @param ng number of genes to simulate. Importantly, for the library sizes 
#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
#'   the number of simulated genes should match with the number of genes 
#'   in the reference. To simulate a reduced number of genes, e.g. for 
#'   testing and development purposes, please set \code{force = TRUE}.
#' @param nc number of cells to simulate.
#' @param nk number of clusters to simulate; defaults to the number
#'   of available reference clusters (\code{nlevels(x$cluster_id)}).  
#' @param ns number of samples to simulate; defaults to as many as
#'   available in the reference to avoid duplicated reference samples. 
#'   Specifically, the number of samples will be set to 
#'   \code{n = nlevels(x$sample_id)} when \code{dd = FALSE}, 
#'   \code{n} per group  when \code{dd, paired = TRUE}, and 
#'   \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}.
#'   When a larger number samples should be simulated, set \code{force = TRUE}.
#' @param probs a list of length 3 containing probabilities of a cell belonging
#'   to each cluster, sample, and group, respectively. List elements must be 
#'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
#' @param dd whether or not to simulate differential distributions; if TRUE, 
#'   two groups are simulated and \code{ns} corresponds to the number of 
#'   samples per group, else one group with \code{ns} samples is simulated.
#' @param p_dd numeric vector of length 6.
#'   Specifies the probability of a gene being
#'   EE, EP, DE, DP, DM, or DB, respectively.
#' @param paired logical specifying whether a paired design should 
#'   be simulated (both groups use the same set of reference samples) 
#'   or not (reference samples are drawn at random).
#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
#'   to be shifted to a different expression state in one group (see details).
#' @param p_type numeric. Probability of EE/EP gene being a type-gene.
#'   If a gene is of class "type" in a given cluster, a unique mean 
#'   will be used for that gene in the respective cluster.
#' @param lfc numeric value to use as mean logFC 
#'   (logarithm base 2) for DE, DP, DM, and DB type of genes.
#' @param rel_lfc numeric vector of relative logFCs for each cluster. 
#'   Should be of length \code{nlevels(x$cluster_id)} with 
#'   \code{levels(x$cluster_id)} as names. 
#'   Defaults to factor of 1 for all clusters.
#' @param phylo_tree newick tree text representing cluster relations 
#'   and their relative distance. An explanation of the syntax can be found 
#'   \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. 
#'   The distance between the nodes, except for the original branch, will be 
#'   translated in the number of shared genes between the clusters belonging to 
#'   these nodes (this relation is controlled with \code{phylo_pars}). 
#'   The distance between two clusters is defined as the sum 
#'   of the branches lengths separating them. 
#' @param phylo_pars vector of length 2 providing the parameters that control 
#'   the number of type genes. Passed to an exponential PDF (see details).
#' @param force logical specifying whether to force simulation 
#'   when \code{ng} and/or \code{ns} don't match the number of 
#'   available reference genes and samples, respectively.
#'   
#' @details The simulation of type genes can be performed in 2 ways; 
#'   (1) via \code{p_type} to simulate independent clusters, OR 
#'   (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
#'   
#'   For (1), a subset of \code{p_type} \% of genes are selected per cluster
#'   to use a different references genes than the remainder of clusters,
#'   giving rise to cluster-specific NB means for count sampling.
#'   
#'   For (2), the number of shared/type genes at each node 
#'   are given by \code{a*G*e^(-b*d)}, where \itemize{
#'   \item{\code{a} -- controls the percentage of shared genes between nodes.
#'   By default, at most 10\% of the genes are reserved as type genes
#'   (when \code{b} = 0). However, it is advised to tune this parameter 
#'   depending on the input \code{prep_sce}.}
#'   \item{\code{b} -- determines how the number of shared genes 
#'   decreases with increasing distance d between clusters 
#'   (defined through \code{phylo_tree}).}}
#'  
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
#'   containing multiple clusters & samples across 2 groups 
#'   as well as the following metadata: \describe{
#'   \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing,
#'   containing, for each cell, it's cluster, sample, and group ID.}
#'   \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing, 
#'   for each gene, it's \code{class} (one of "state", "type", "none") and 
#'   specificity (\code{specs}; NA for genes of type "state", otherwise 
#'   a character vector of clusters that share the given gene).}
#'   \item{experiment metadata (\code{metadata(.)})}{
#'   \describe{
#'   \item{\code{experiment_info}}{a \code{data.frame} 
#'   summarizing the experimental design.}
#'   \item{\code{n_cells}}{the number of cells for each sample.}
#'   \item{\code{gene_info}}{a \code{data.frame} containing, for each gene
#'   in each cluster, it's differential distribution \code{category},
#'   mean \code{logFC} (NA for genes for categories "ee" and "ep"),
#'   gene used as reference (\code{sim_gene}), dispersion \code{sim_disp},
#'   and simulation means for each group \code{sim_mean.A/B}.}
#'   \item{\code{ref_sids/kidskids}}{the sample/cluster IDs used as reference.}
#'   \item{\code{args}}{a list of the function call's input arguments.}}}}
#'   
#' @examples
#' data(example_sce)
#' library(SingleCellExperiment)
#' 
#' # prep. SCE for simulation
#' ref <- prepSim(example_sce)
#' 
#' # simulate data
#' (sim <- simData(ref, nc = 200,
#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0),
#'   ng = 100, force = TRUE,
#'   probs = list(NULL, NULL, c(1, 0))))
#' 
#' # simulation metadata
#' head(gi <- metadata(sim)$gene_info)
#' 
#' # should be ~10% DE  
#' table(gi$category)
#' 
#' # unbalanced sample sizes
#' sim <- simData(ref, nc = 100, ns = 2,
#'   probs = list(NULL, c(0.25, 0.75), NULL),
#'   ng = 10, force = TRUE)
#' table(sim$sample_id)
#' 
#' # one group only
#' sim <- simData(ref, nc = 100,
#'   probs = list(NULL, NULL, c(1, 0)),
#'   ng = 10, force = TRUE)
#' levels(sim$group_id)
#' 
#' # HIERARCHICAL CLUSTER STRUCTURE
#' # define phylogram specifying cluster relations
#' phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
#' # verify syntax & visualize relations
#' library(phylogram)
#' plot(read.dendrogram(text = phylo_tree))
#' 
#' # let's use a more complex phylogeny
#' phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3':
#'   0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
#' plot(read.dendrogram(text = phylo_tree))
#' 
#' # simulate clusters accordingly
#' sim <- simData(ref, 
#'   phylo_tree = phylo_tree, 
#'   phylo_pars = c(0.1, 3), 
#'   ng = 500, force = TRUE)
#' # view information about shared 'type' genes
#' table(rowData(sim)$class)
#' 
#' @author Helena L Crowell & Anthony Sonrel
#' 
#' @references 
#' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
#' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
#' On the discovery of population-specific state transitions from 
#' multi-sample multi-condition single-cell RNA sequencing data. 
#' \emph{bioRxiv} \strong{713412} (2018). 
#' doi: \url{https://doi.org/10.1101/713412}
#' 
#' @importFrom data.table data.table
#' @importFrom dplyr mutate_all mutate_at
#' @importFrom edgeR DGEList estimateDisp glmFit
#' @importFrom purrr modify_depth set_names
#' @importFrom stats model.matrix rgamma setNames
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom SummarizedExperiment colData
#' @importFrom S4Vectors split unfactor
#' @export

simData <- function(x, 
    ng = nrow(x), nc = ncol(x), 
    ns = NULL, nk = NULL, probs = NULL, 
    dd = TRUE, p_dd = diag(6)[1, ], paired = FALSE,
    p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
    p_type = 0, lfc = 2, rel_lfc = NULL, 
    phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
    force = FALSE) {
    
    # throughout this code...
    # k: cluster ID
    # s: sample ID
    # g: group ID
    # c: DD category
    # 0: reference
    
    # add mock cluster/sample ID if missing
    if (is.null(x$cluster_id)) {
        x$cluster_id <- factor("foo")
        no_k <- TRUE
    } else {
        x$cluster_id <- droplevels(factor(x$cluster_id))
        no_k <- nlevels(x$cluster_id) == 1
    }
    if (is.null(x$sample_id)) {
        x$sample_id <- factor("foo")
        no_s <- TRUE
    } else {
        x$sample_id <- droplevels(factor(x$sample_id))
        no_s <- nlevels(x$sample_id) == 1
    }
    
    # store all input arguments to be returned in final output
    args <- c(as.list(environment()))
    
    # check validity of input arguments
    .check_sce(x, req_group = FALSE)
    args_tmp <- .check_args_simData(as.list(environment()))
    nk <- args$nk <- args_tmp$nk

    # reference IDs
    nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
    ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
    
    # get number of samples to simulate
    ns <- .get_ns(ns0, ns, dd, paired, force)
    
    # simulation IDs
    nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
    sids <- set_names(paste0("sample", seq_len(ns)))
    gids <- set_names(c("A", "B"))
    
    # sample reference clusters & samples
    ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
    if (!dd || paired) {
        # use same set of reference samples for both groups
        ref_sids <- sample(sids0, ns, ns > ns0)
        ref_sids <- cbind(ref_sids, ref_sids)
    } else {
        # draw reference samples at random for each group
        sidsA <- sample(sids0, ns, force && ns > ns0)
        if (force) {
            sidsB <- sample(sids0, ns, ns > ns0)
        } else {
            sidsB <- setdiff(sids0, sidsA)
            sidsB <- sample(sidsB, ns)
        }
        ref_sids <- cbind(sidsA, sidsB)
    }
    dimnames(ref_sids) <- list(sids, gids)

    if (is.null(rel_lfc)) 
        rel_lfc <- rep(1, nk)
    if (is.null(names(rel_lfc))) {
        names(rel_lfc) <- kids
    } else {
        stopifnot(names(rel_lfc) %in% kids0)
    }
    
    # initialize count matrix
    gs <- paste0("gene", seq_len(ng))
    cs <- paste0("cell", seq_len(nc))
    y <- matrix(0, ng, nc, dimnames = list(gs, cs))
    
    # sample cell metadata
    cd <- .sample_cell_md(
        n = nc, probs = probs,
        ids = list(kids, sids, gids))
    rownames(cd) <- cs
    cs_idx <- .split_cells(cd, by = colnames(cd))
    n_cs <- modify_depth(cs_idx, -1, length)
    
    # split input cells by cluster-sample
    cs_by_ks <- .split_cells(x)
    
    # sample nb. of genes to simulate per category & gene indices
    n_dd <- table(sample(cats, ng, TRUE, p_dd))
    n_dd <- replicate(nk, n_dd)
    colnames(n_dd) <- kids
    gs_idx <- .sample_gene_inds(gs, n_dd)
    
    # for ea. cluster, sample set of genes to simulate from
    gs_by_k <- setNames(sample(rownames(x), ng, ng > nrow(x)), gs)
    gs_by_k <- replicate(nk, gs_by_k)
    colnames(gs_by_k) <- kids

    # when 'phylo_tree' is specified, induce hierarchical cluster structure
    if (!is.null(phylo_tree)) {                                  
        res <- .impute_shared_type_genes(x, gs_by_k, gs_idx, phylo_tree, phylo_pars)
        gs_by_k <- res$gs_by_k
        class  <- res$class
        specs <- res$specs
    # otherwise, simply impute type-genes w/o phylogeny
    } else if (p_type != 0) {
        res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
        stopifnot(!any(res$class == "shared"))
        gs_by_k <- res$gs_by_k
        class <- res$class
        specs <- res$specs
    } else {
        class <- rep("state", ng)
        specs <- rep(NA, ng)
        names(class) <- names(specs) <- gs
    }

    # split by cluster & categroy
    gs_by_k <- split(gs_by_k, col(gs_by_k))
    gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids)
    gs_by_kc <- lapply(kids, function(k) 
        lapply(unfactor(cats), function(c) 
            gs_by_k[[k]][gs_idx[[c, k]]])) 
    
    # sample logFCs
    lfc <- vapply(kids, function(k) 
        lapply(unfactor(cats), function(c) { 
            n <- n_dd[c, k]
            if (c == "ee") return(rep(NA, n))
            signs <- sample(c(-1, 1), n, TRUE)
            lfcs <- rgamma(n, 4, 4/lfc) * signs
            names(lfcs) <- gs_by_kc[[k]][[c]]
            lfcs * rel_lfc[k]
        }), vector("list", length(cats)))

    # compute NB parameters
    bs <- rowData(x)$beta
    if (!is.null(bs$sample_id)) {
        bs$sample_id <- cbind(0, bs$sample_id)
        names(bs$sample_id) <- sids0
    }
    if (!is.null(bs$cluster_id)) {
        bs$cluster_id <- cbind(0, bs$cluster_id)
        names(bs$cluster_id) <- kids0
    }
    os <- x$offset
    b0 <- bs$beta0
    ds <- rowData(x)$disp
    
    # initialize list of depth two to store 
    # simulation means in each cluster & group
    sim_mean <- lapply(kids, function(k) 
        lapply(gids, function(g) 
            setNames(numeric(ng), gs)))
    
    # run simulation -----------------------------------------------------------
    for (k in kids) {
        for (s in sids) {
            # get output cell indices
            ci <- cs_idx[[k]][[s]]
            
            # get reference samples, clusters & cells
            s0 <- ref_sids[s, ]
            k0 <- ref_kids[k]
            cs0 <- cs_by_ks[[k0]][s0]
            
            # get NB parameters
            bs_ks <- cbind(b0,
                bs$cluster_id[[k0]],
                bs$sample_id[[s0[1]]])
            
            for (c in cats[n_dd[, k] != 0]) {
                # sample cells to simulate from
                cs_g1 <- sample(cs0[[1]], n_cs[[k]][[s]][[1]], TRUE)
                cs_g2 <- sample(cs0[[2]], n_cs[[k]][[s]][[2]], TRUE)
                
                # get reference genes & output gene indices
                gs0 <- gs_by_kc[[k]][[c]] 
                gi <- gs_idx[[c, k]]
                
                # get NB parameters
                ds_kc <- ds[gs0]
                lfc_kc <- lfc[[c, k]]
                bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE]))
                ms_g1 <- outer(bs_ksc, exp(os[cs_g1]), "*")
                ms_g2 <- outer(bs_ksc, exp(os[cs_g2]), "*")
                
                re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm)
                y[gi, unlist(ci)] <- re$cs
                
                for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
                    is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
            }
        }
    }
    # construct gene metadata table storing ------------------------------------
    # gene | cluster_id | category | logFC, gene, disp, mean used for sim.
    gi <- data.frame(
        gene = unlist(gs_idx),
        cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
        category = rep.int(rep(cats, nk), c(n_dd)),
        logFC = unlist(lfc),
        sim_gene = unlist(gs_by_kc),
        sim_disp = ds[unlist(gs_by_kc)]) %>% 
        mutate_at("gene", as.character)
    # add true simulation means
    sim_mean <- sim_mean %>%
        map(bind_cols) %>% 
        bind_rows(.id = "cluster_id") %>% 
        mutate(gene = rep(gs, nk))
    gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% 
        rename("sim_mean.A" = "A", "sim_mean.B" = "B") %>% 
        mutate_at("cluster_id", factor)
    # reorder
    o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
    gi <- gi[o, ]; rownames(gi) <- NULL
    
    # construct SCE ------------------------------------------------------------
    # cell metadata including group, sample, cluster IDs
    cd$group_id <- droplevels(cd$group_id)
    cd$sample_id <- if (dd) {
        factor(paste(cd$sample_id, cd$group_id, sep = "."))
    } else factor(cd$sample_id)
    m <- match(levels(cd$sample_id), cd$sample_id)
    gids <- cd$group_id[m]
    o <- order(gids)
    sids <- levels(cd$sample_id)[o]
    cd <- cd %>% 
        mutate_at("sample_id", factor, levels = sids) %>% 
        mutate_at("cluster_id", factor, levels = kids)
    if (!dd) {
        cd$group_id <- NULL
        ref_sids <- ref_sids[, 1]
    }
    if (no_s) cd$sample_id <- NULL
    if (no_k) cd$cluster_id <- NULL
    # gene metadata storing gene classes & specificities
    rd <- DataFrame(class = factor(class, 
        levels = c("state", "shared", "type")))
    rd$specs <- as.list(specs)
    # simulation metadata including used reference samples/cluster, 
    # list of input arguments, and simulated genes' metadata
    ei <- data.frame(sample_id = sids, group_id = gids[o])
    md <- list(
        experiment_info = ei,
        n_cells = table(cd$sample_id),
        gene_info = gi,
        ref_sids = ref_sids,
        ref_kids = ref_kids, 
        args = args)
    # return SCE 
    SingleCellExperiment(
        assays = list(counts = as.matrix(y)),
        colData = cd, rowData = rd, metadata = md)
}