R/reads_simulator.R

Defines functions donor_read_simulator sample_tree_SNV sample_seq_depth sim_read_count

Documented in donor_read_simulator sample_seq_depth sample_tree_SNV sim_read_count

#' Synthetic reads generator for genetic variants
#'
#' There are following steps to generate the simulated reads counts for variants
#' in single cells:
#' 1) given the clonal genotype and the clonal prevalence, the genotypes (i.e,
#' the clone) of cells will be generated following a multinomial distribution.
#' Note, one cell may contain variants from two clones when it is a doublet.
#' 2) given the distribution of reads coverage, e.g., a matrix of read coverage
#' from real data, (variant specific), the total reads of each variant will be
#' generated by random sampling. Note, the missing rate is governed by this
#' matrix.
#' 3) the allelic frequency of each variant will be generated by following a
#' beta distribution with parameters of mean and variance.
#' 4) Given the genotype of a cell, if the mutation exists in a cell, the
#' alteration read counts will be generated by a binomial distribution,
#' parameterized the allelic frequency, sampled from step 3.
#' 5) Given the genotype of a cell, if the mutation does not exist in a cell,
#' the alteration read counts will be generated by a binomial distribution,
#' parameterized by the technical error rate.
#'
#' @export
#'
#' @param Config A matrix of binary values. The clone-variant configuration,
#' which encodes the phylogenetic tree structure, and the genotype of each clone
#' @param D A matrix of integers. Sequencing depth for N variants across x
#' cells (ideally >100 cells). NA means 0 here.
#' @param Psi A vector of float. The fractions of each clone. If NULL, set a
#' uniform distribution.
#' @param means A vector of two floats. The mean theta_1 (false positive rate)
#' and the mean theta_2 (true positive rate).
#' @param vars A vector of two floats. The variance of theta_1 and theta_2.
#' @param cell_num A integer. The number of cells to generate.
#' @param permute_D A Boolean value. If True permute variants in D.
#' @param wise0 A string, the beta-binomial parameter specificity for theta0:
#' global, variant, element.
#' @param wise1 A string, the beta-binomial parameter specificity for theta1:
#' global, variant, element.
#' @param doublet A float between 0 and 1, the rate of doublets
#' @param sample_cell A Boolean value. If True and M > ncol(D), sample cells.
#'
#' @return a list containing \code{A_sim}, a matrix for alteration reads,
#' \code{A_sim}, a matrix for total reads, \code{I_sim}, a matrix for clonal
#' label, \code{H_sim}, a matrix for genotype, \code{theta0}, a matrix of
#' expected false positive rate, \code{theta1}, a matrix of expected true
#' positive rate, \code{theta0_binom}, theta0 as binomial parameter,
#' \code{theta1_binom}, theta0 as binomial parameter, and \code{is_doublet}, a
#' vector of Boolean value if a cell is a doublet
#'
#' @examples
#' data(simulation_input)
#' D2 <- sample_seq_depth(D_input, n_cells = 500, n_sites = nrow(tree_4clone$Z))
#' simu <- sim_read_count(tree_4clone$Z, D2, Psi = NULL, cell_num = 500)
sim_read_count <- function(Config, D, Psi = NULL,
    means = c(0.002, 0.45), vars = c(100, 1),
    wise0 = "element", wise1 = "variant", cell_num = 300,
    permute_D = FALSE, sample_cell = TRUE, doublet = 0.0) {
    M <- cell_num # number of cells
    K <- ncol(Config) # number of clones
    N <- nrow(Config) # number of variants

    shape1s <- means * vars
    shape2s <- (1.0 - means) * vars

    D[which(D == 0)] <- NA
    if (permute_D == TRUE) {
        D <- D[sample(nrow(D)), ]
    }
    if (sample_cell || M > ncol(D)) {
        D_sim <- D[, sample(ncol(D), M, replace = TRUE)]
    } else {
        D_sim <- D[, seq_len(M)]
    }

    # genotype for cells H_sim, and clone labels I_sim
    if (is.null(Psi)) {
        Psi <- rep(1 / K, K)
    }
    is_doublet <- rep(FALSE, M)
    is_doublet[sample(M, round(doublet * M))] <- TRUE
    I_sim <- rmultinom(M, 1, prob = Psi) # Clonal label for each cell
    I_sim[, is_doublet] <- rmultinom(sum(is_doublet), 2, prob = Psi)
    I_sim[which(I_sim > 1)] <- 1
    H_sim <- Config %*% I_sim # Genotype for each cell

    # generate p, D and A
    p_sim <- matrix(0, 2) # p1 and p2 for False positive and True positive
    A_sim <- matrix(NA, N, M) # Alteration counts matrix
    A_germ_sim <- matrix(NA, N, M) # Alteration counts matrix for germline var
    D_germ_sim <- matrix(NA, N, M)
    theta0_Bern_sim <- matrix(NA, N, M) # theta0 matrix
    theta1_Bern_sim <- matrix(NA, N, M) # theta1 matrix
    theta0_binom_sim <- matrix(NA, N, M) # theta0 matrix
    theta1_binom_sim <- matrix(NA, N, M) # theta1 matrix

    p_sim[1] <- stats::rbeta(1, shape1s[1], shape2s[1])
    p_sim[2] <- stats::rbeta(1, shape1s[2], shape2s[2])
    for (i in seq_len(N)) {
        if (wise0 == "variant") {
            p_sim[1] <- stats::rbeta(1, shape1s[1], shape2s[1])
        }
        if (wise1 == "variant") {
            p_sim[2] <- stats::rbeta(1, shape1s[2], shape2s[2])
        }
        for (j in seq_len(M)) {
            if (wise0 == "element") {
                p_sim[1] <- stats::rbeta(1, shape1s[1], shape2s[1])
            }
            if (wise1 == "element") {
                p_sim[2] <- stats::rbeta(1, shape1s[2], shape2s[2])
            }
            if (!is.na(D_sim[i, j])) {
                D_germ_sim[i, j] <- D_sim[i, j]
                A_germ_sim[i, j] <- rbinom(1, D_germ_sim[i, j], p_sim[2])

                theta0_binom_sim[i, j] <- p_sim[1]
                theta1_binom_sim[i, j] <- p_sim[2]
                theta0_Bern_sim[i, j] <- 1 - dbinom(0,
                    size = D_sim[i, j],
                    prob = p_sim[1]
                )
                theta1_Bern_sim[i, j] <- 1 - dbinom(0,
                    size = D_sim[i, j],
                    prob = p_sim[2]
                )
                if (H_sim[i, j] == 0) {
                    A_sim[i, j] <- rbinom(1, D_sim[i, j], p_sim[1])
                } else {
                    A_sim[i, j] <- rbinom(1, D_sim[i, j], p_sim[2])
                }
            }
        }
    }
    I_sim <- t(I_sim)
    colnames(I_sim) <- colnames(Config)
    row.names(A_sim) <- row.names(D_sim) <- row.names(Config)
    row.names(A_germ_sim) <- row.names(D_germ_sim) <- row.names(Config)

    cell_names <- paste0("cell", seq_len(ncol(A_sim)))
    row.names(I_sim) <- colnames(A_sim) <- colnames(D_sim) <- cell_names
    colnames(A_germ_sim) <- colnames(D_germ_sim) <- cell_names

    return_list <- list(
        "H_sim" = H_sim, "I_sim" = I_sim,
        "A_sim" = A_sim, "D_sim" = D_sim,
        "A_germ_sim" = A_germ_sim, "D_germ_sim" = D_germ_sim,
        "theta0" = theta0_Bern_sim, "theta1" = theta1_Bern_sim,
        "theta0_binom" = theta0_binom_sim,
        "theta1_binom" = theta1_binom_sim,
        "is_doublet" = is_doublet
    )
    return_list
}

#' Update matrix D with manually selected missing rate
#'
#' Given missing rate, the NA will be generated first. For none NA element,
#' sequencing depth with uniformly sampled from D, row wisely. Namely, the depth
#' is variant specific.
#'
#' @param D A matrix (N variants x M cells), the original sequencing coverage,
#' NA means missing
#' @param n_cells A integer, the number of the cells to generate
#' @param n_sites A integer, the number of variants to generate
#' @param missing_rate A float value, if NULL, use the same missing rate as D
#' @return a n_sites by n_cells matrix sampled from input D.
#'
#' @export
#' @examples
#' data(simulation_input)
#' D1 <- sample_seq_depth(D_input,
#'     n_cells = 500, n_sites = 50,
#'     missing_rate = 0.85
#' )
sample_seq_depth <- function(D, n_cells = NULL, n_sites = NULL,
    missing_rate = NULL) {
    D[which(D == 0)] <- NA
    if (is.null(n_cells)) {
        n_cells <- ncol(D)
    }
    if (is.null(n_sites)) {
        n_sites <- nrow(D)
    }
    missing_vector <- rowMeans(is.na(D))
    if (!is.null(missing_rate)) {
        if (missing_rate >= 1.0) {
            stop("missing rate should be < 1.0!")
        }
        m_upbound <- 1 - 1.0 / n_cells
        m_mean <- mean(missing_vector)
        amplify <- min(
            missing_rate / m_mean,
            (m_upbound - missing_rate) / (max(missing_vector) - m_mean)
        )
        missing_vector <- amplify * (missing_vector - m_mean) + missing_rate
    }

    idx <- sample(nrow(D), size = n_sites, replace = (n_sites > nrow(D)))
    D_input <- D[idx, ]
    missing_vector <- missing_vector[idx]

    D_output <- matrix(NA, nrow = n_sites, ncol = n_cells)
    for (i in seq_len(nrow(D_input))) {
        D_no_na <- D_input[i, which(!is.na(D_input[i, ]) & D_input[i, ] > 0)]
        D_output[i, ] <- sample(D_no_na, n_cells, replace = TRUE)
        missing_idx <- sample(n_cells, round(missing_vector[i] * n_cells),
            replace = FALSE
        )
        D_output[i, missing_idx] <- NA
    }

    D_output
}


#' Down sample number of SNVs in the tree
#'
#' @param tree A tree object from Canopy
#' @param n_SNV A integer, the number of SNVs to keep in the output tree
#' @return a phylo tree with down sampled variants
#'
#' @export
#' @examples
#' data(simulation_input)
#' tree_lite <- sample_tree_SNV(tree_4clone, n_SNV = 10)
sample_tree_SNV <- function(tree, n_SNV = NULL) {
    total_SNV <- nrow(tree$Z)
    if (is.null(n_SNV)) {
        n_SNV <- total_SNV
    }
    if (total_SNV < n_SNV) {
        stop("Only support down sample SNVs")
    }
    idx <- sample(total_SNV, size = n_SNV)

    out.tree <- tree
    out.tree$Z <- out.tree$Z[idx, ]
    out.tree$CCF <- out.tree$CCF[idx, , drop = FALSE]
    out.tree$VAF <- out.tree$VAF[idx, , drop = FALSE]
    out.tree$sna <- out.tree$sna[idx, , drop = FALSE]

    out.tree
}

#' Reads simulator for donor identification
#' @param GT Variant-by-donor matrix for genotypes
#' @param D_seed Variant-by-cell matrix for read coverage for generating depth,
#' which be row sample and column sample both with replacement
#' @param sample_variants logical(1), if TRUE, sample variants with replacement
#' to the same size, otherwise not
#' @param donor_size Vector of float for the fractions of each donor; default
#' NULL means uniform
#' @param beta_shapes A 3-by-2 matrix of beta parameters for genotypes: 0, 1,
#' and 2; default NULL means matrix(c(0.2, 0.5, 99.8, 99.8, 0.5, 0.2), nrow = 3)
#' @param n_cell An integer for number of total cells
#' @param doublet_rate A float from 0 to 1 for doublet rate; default NULL means
#' rate n_cell / 100000
#' 
#' @return A list of various components of the simulated dataset.
#' @export
#'
donor_read_simulator <- function(GT, D_seed, sample_variants = FALSE,
    donor_size = NULL, beta_shapes = NULL,
    n_cell = 5000, doublet_rate = NULL) {
    K <- ncol(GT) # number of clones
    N <- nrow(GT) # number of variants
    if (is.null(beta_shapes)) {
        # beta_shapes <- matrix(c(0.3, 3, 29.7, 29.7, 3, 0.3), nrow = 3)
        beta_shapes <- matrix(c(0.2, 0.1, 99.8, 99.8, 0.1, 0.2), nrow = 3)
    }
    if (is.null(doublet_rate)) {
        doublet_rate <- n_cell / 100000
        print(paste("doublet rate:", doublet_rate))
    }
    if (is.null(donor_size)) {
        donor_size <- rep(1 / K, K)
    }

    ## generate donor id matrix
    n_doublet <- round(doublet_rate * n_cell)
    I_sim <- t(rmultinom(n_cell + n_doublet, 1, prob = donor_size))

    ## generate D
    if (sample_variants) {
        D_sim <- D_seed[sample(nrow(D_seed), N, replace = TRUE), ]
    } else {
        D_sim <- D_seed
    }
    D_sim <- D_sim[, sample(ncol(D_sim), nrow(I_sim), replace = TRUE)]
    D_sim <- Matrix::Matrix(D_sim, sparse = TRUE)

    ## generate A with beta binomial
    A_sim <- D_sim
    H_sim <- GT %*% t(I_sim)
    for (ii in seq_len(3)) {
        val2 <- beta_shapes[ii, 1] + beta_shapes[ii, 2]
        val1 <- beta_shapes[ii, 1] / val2

        idx <- which(as.matrix(D_sim) > 0 & H_sim == (ii - 1))
        A_sim[idx] <- VGAM::rbetabinom(length(idx),
            size = D_sim[idx],
            prob = val1, rho = 1 / (1 + val2)
        )
    }

    ## pooling doublet
    idx_doublet1 <- sample(n_cell, n_doublet)
    idx_doublet2 <- seq(n_cell + 1, n_cell + n_doublet)
    A_sim[, idx_doublet1] <- A_sim[, idx_doublet1] + A_sim[, idx_doublet2]
    D_sim[, idx_doublet1] <- D_sim[, idx_doublet1] + D_sim[, idx_doublet2]
    I_sim[idx_doublet1, ] <- I_sim[idx_doublet1, ] + I_sim[idx_doublet2, ]
    A_sim <- A_sim[, seq_len(n_cell)]
    D_sim <- D_sim[, seq_len(n_cell)]
    I_sim <- I_sim[seq_len(n_cell), ]

    ## output
    colnames(I_sim) <- colnames(GT)
    row.names(A_sim) <- row.names(D_sim) <- row.names(GT)
    cell_names <- paste0("cell", seq_len(ncol(A_sim)))
    row.names(I_sim) <- colnames(A_sim) <- colnames(D_sim) <- cell_names

    return_list <- list(
        "A" = A_sim, "D" = D_sim,
        "I_sim" = I_sim, "GT" = GT
    )
    return_list
}
davismcc/cardelino documentation built on Nov. 19, 2022, 2:44 a.m.