R/EM.R
In bisect: Estimating Cell Type Composition from Methylation Sequencing Data

Documented in bisect_semi_suprevised bisect_supervised

E_1_step <- function(P, Pi) {
    N <- nrow(P)
    J <- nrow(Pi)
    K <- ncol(P)

    E <- array(dim = c(N, J, K))

    constants <- tcrossprod(P, Pi)  #  P %*% t(Pi)
    for (k in 1:K) {
        E[, , k] <- outer(P[, k], Pi[, k])/constants
    }

    return(E)
}

E_0_step <- function(P, Pi) {
    N <- nrow(P)
    J <- nrow(Pi)
    K <- ncol(P)

    E <- array(dim = c(N, J, K))

    constants <- tcrossprod(P, 1 - Pi)  # P %*% (1-t(Pi))
    for (k in 1:K) {
        E[, , k] <- outer(P[, k], 1 - Pi[, k])/constants
    }

    return(E)
}

Maximize_P <- function(E_0, E_1, R, Y, reads_i, alpha) {
    K <- dim(E_0)[3]
    constants <- reads_i + sum(alpha) - K

    N <- nrow(R)

    result <- matrix(nrow = N, ncol = K)

    for (k in 1:K) {
        nominator <- rowSums(E_1[, , k] * Y) + rowSums(E_0[, , k] * (R - Y)) + alpha[k] - 1
        result[, k] <- nominator/constants
    }

    return(result)
}

Maximize_Pi <- function(E_0, E_1, Y, R) {
    J <- ncol(Y)
    K <- dim(E_0)[3]
    result <- matrix(nrow = J, ncol = K)
    for (k in 1:K) {
        nominator <- colSums(Y * E_1[, , k])
        denominator <- colSums(Y * E_1[, , k]) + colSums((R - Y) * E_0[, , k])

        result[, k] <- nominator/denominator
    }

    return(result)
}


## R - a matrix of individuals (rows) on sites (columns), containing the total number of reads for each site, in each individual.
## Y - a matrix of individuals (rows) on sites (columns), containing the number of methylated reads for each site, in each individual.
## Pi - A matrix of sites (rows) on cell types (columns), containing the probability for methylation in each site, in each cell type.  If you need a random Pi matrix use
## 'initialize.Pi()'.  P - A matrix of individuals (rows) on cell types (columns), consisted of the estimated proportion of each cell type in each
## individual.  The top n_known_samples row should be the cell proportions of the known individuals (if any), and the rest should be initialized with
## initial.P().  n_known_samples: an integer, the number of known individuals.  If not known, initialize to a random value between 0 and 1.  alpha - a
## list of length K (amount of cell types), containing the hyper-paramters of the dirichlet prior. If None, a non-informative prior is used.
run_EM <- function(R, Y, Pi, P, alpha, n_known_samples = 0, iterations = 200, estimate_Pi = F, estimate_P = T) {

    minimum_cell_proportion <- 0.001

    # The samples to estimate + the known samples.
    reads_i <- rowSums(R)
    progress_bar <- txtProgressBar(min = 0, max = iterations)
    for (i in 1:iterations) {
        E_0 <- E_0_step(P, Pi)
        E_1 <- E_1_step(P, Pi)

        if (estimate_P) {
            maximize_P_results <- Maximize_P(E_0, E_1, R, Y, reads_i, alpha = alpha)
            if (n_known_samples > 0) {
                P[-(1:n_known_samples), ] <- maximize_P_results[-(1:n_known_samples), ]
            } else {
                P <- maximize_P_results
            }

            ## make sure that P doesn't leave allowed range to prevent numeric errors.
            P[P < 0] <- minimum_cell_proportion
            P[P > 1] <- 1 - minimum_cell_proportion
        }

        if (estimate_Pi) {
            Pi <- Maximize_Pi(E_0, E_1, Y, R)
        }

        setTxtProgressBar(progress_bar, i)
    }

    return(list(P = P, Pi = Pi))
}

initialize_P <- function(n_individuals, n_cell_types, alpha) {
    initial_P <- rep(alpha/sum(alpha), n_individuals)
    initial_P <- matrix(initial_P, nrow = n_individuals, byrow = T)

    return(initial_P)
}

#' @importFrom stats runif
initialize_Pi <- function(n_sites, n_cell_types) {
    Pi <- runif(n_sites * n_cell_types)
    Pi <- matrix(Pi, nrow = n_sites)

    return(Pi)
}


### Exported Functions ---------------------------------------------------
#' Add together two numbers.
#'
#' @param methylation a matrix of individuals (rows) on sites (columns), containing the number of methylated reads for each site, in each individual.
#' @param total_reads a matrix of individuals (rows) on sites (columns), containing the total number of reads for each site, in each individual.
#' @param reference a matrix of sites (rows) on cell types (columns), containing the probability for methylation in each site, in each cell type.
#' @param alpha a vector containing the hyper-parameters for the dirichelt prior. One value for each cell type. If NA, it is initiallized to 1/(number of cell types).
#' @param iterations the number of iterations to use in the EM algorithm.
#' @return A matrix of individuals (rows) on cell types (columns) containing the estimated proportion of each cell type, in each individual.
#' @examples
#' ## Prepare the methylation and total reads matrices
#' methylation <- as.matrix(methylation_GSE40279)
#' total_reads <- as.matrix(total_reads_GSE40279)
#' ## Remove the IDs column from the reference
#' Pi <- as.matrix(reference_blood[,-1])
#'
#' ## Run Bisect. You should use around 200 iterations. I choose than to accelarate the example.
#' results <- bisect_supervised(methylation, total_reads, Pi, alpha_blood, iterations = 10)
#' @importFrom utils txtProgressBar
#' @importFrom utils setTxtProgressBar
#' @export
bisect_supervised <- function(methylation, total_reads, reference, alpha = NA, iterations = 200) {
    N <- nrow(methylation)
    J <- ncol(methylation)
    K <- ncol(reference)

    if (length(alpha) == 1) {
        if (is.na(alpha)) {
            alpha <- rep(1, K)
        }
    }

    initial_P <- initialize_P(N, J, alpha)

    results <- run_EM(total_reads, methylation, reference, initial_P, alpha, iterations = iterations, estimate_P = T)

    return(results$P)
}

#' Add together two numbers.
#'
#' @param methylation_unkown_samples a matrix of individuals (rows) on sites (columns), containing the number of methylated reads for each site, in each individual for the samples with unknown cell composition.
#' @param total_reads_unknown_samples a matrix of individuals (rows) on sites (columns), containing the total number of reads for each site, in each individual for the samples with unknown cell composition.
#' @param methylation_known_samples a matrix of individuals (rows) on sites (columns), containing the number of methylated reads for each site, in each individual for the samples with known cell composition.
#' @param total_reads_known_samples a matrix of individuals (rows) on sites (columns), containing the total number of reads for each site, in each individual for the samples with known cell composition.
#' @param cell_composition_known_samples a matrix of individuals (rows) on cell types (columns), containing the proportion of each cell type, in each known sample.
#' @param alpha a vector containing the hyper-parameters for the dirichelt prior. One value for each cell type. If NA, it is initiallized to 1/(number of cell types).
#' @param iterations the number of iterations to use in the EM algorithm.
#' @return A list containing P, a matrix of estimated cell proportions for the unknown samples, and Pi, an estimated reference (the probability of methylation in each cell type).
#' @examples
#' ## Randomly choose samples to be used as known
#' n_known_samples <- 50
#' known_samples_indices <- sample.int(nrow(baseline_GSE40279), size = n_known_samples)
#' known_samples <- as.matrix(baseline_GSE40279[known_samples_indices, ])
#'
#' ## Fit a dirichlet distribution to known samples to use as prior
#' fit_dirichlet <- sirt::dirichlet.mle(as.matrix(known_samples))
#' alpha <- fit_dirichlet$alpha
#'
#' ## Prepare the 4 needed matrices
#' methylation_known <- methylation_GSE40279[known_samples_indices, ]
#' methylation_unknown <-methylation_GSE40279[-known_samples_indices, ]
#' total_known <- total_reads_GSE40279[known_samples_indices, ]
#' total_unknown <- total_reads_GSE40279[-known_samples_indices, ]
#'
#' ## Run Bisect. You should use around 200 iterations. I choose than to accelarate the example.
#' results <- bisect_semi_suprevised(methylation_unknown, total_unknown,
#'                                   methylation_known, total_known,
#'                                   known_samples, alpha, iterations = 10)
#' @export
bisect_semi_suprevised <- function(methylation_unkown_samples, total_reads_unknown_samples, methylation_known_samples, total_reads_known_samples, cell_composition_known_samples,
    alpha = NA, iterations = 200) {
    N_unknown <- nrow(methylation_unkown_samples)
    J <- ncol(methylation_unkown_samples)
    K <- ncol(cell_composition_known_samples)
    N_known <- nrow(methylation_known_samples)

    if (length(alpha) == 1) {
        if (is.na(alpha)) {
            alpha <- rep(1, K)
        }
    }

    ## Estimate a starting point for Pi using only known samples
    print("estimating reference .........")

    initial_Pi <- initialize_Pi(J, K)
    results_known_samples <- run_EM(R = total_reads_known_samples, Y = methylation_known_samples, Pi = initial_Pi, P = cell_composition_known_samples,
        alpha = alpha, n_known_samples = N_known, iterations = iterations, estimate_Pi = T, estimate_P = F)

    initial_Pi <- results_known_samples$Pi

    ## Estimate P and Pi together, use the inital Pi from before as a starting point
    unknown_P <- initialize_P(N_unknown, J, alpha)
    initial_P <- rbind(cell_composition_known_samples, unknown_P)

    Y <- rbind(methylation_known_samples, methylation_unkown_samples)
    R <- rbind(total_reads_known_samples, total_reads_unknown_samples)

    n_known_samples <- nrow(methylation_known_samples)

    print("estimating cell composition .........")
    results <- run_EM(R, Y, initial_Pi, initial_P, alpha, n_known_samples = n_known_samples, estimate_Pi = T, iterations = iterations)

    ## Only return P for unknown samples
    results$P <- results$P[-(1:N_known), ]
    return(results)
}

Any scripts or data that you put into this service are public.

bisect documentation built on May 2, 2019, 9:20 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

bisect
Estimating Cell Type Composition from Methylation Sequencing Data

R/EM.R
In bisect: Estimating Cell Type Composition from Methylation Sequencing Data

Defines functions E_1_step E_0_step Maximize_P Maximize_Pi run_EM initialize_P initialize_Pi bisect_supervised bisect_semi_suprevised

Documented in bisect_semi_suprevised bisect_supervised

Try the bisect package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bisect Estimating Cell Type Composition from Methylation Sequencing Data

R/EM.R In bisect: Estimating Cell Type Composition from Methylation Sequencing Data

Defines functions E_1_step E_0_step Maximize_P Maximize_Pi run_EM initialize_P initialize_Pi bisect_supervised bisect_semi_suprevised

Documented in bisect_semi_suprevised bisect_supervised

Try the bisect package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bisect
Estimating Cell Type Composition from Methylation Sequencing Data

R/EM.R
In bisect: Estimating Cell Type Composition from Methylation Sequencing Data