# Function and helper function to estimate synthesis (mu) and degradation
# (lambda) rates based on labeled (alpha) and unlabeled (beta) amounts of RNA
#
# Author: demel
###############################################################################
#' Estimate best alpha = labeled amount and best beta = unlabeled amount of one
#' gene (either for one or multiple samples).
#' Called in estimateAlphaBeta.
#'
#' @param counts observed counts for a gene (in labeled and total fraction)
#' @param labeled.samples index of labeled sample(s) for which alpha and beta
#' should be estimated
#' @param total.samples index of total sample(s) for which alpha and beta
#' should be estimated
#' @param L length of the gene
#' @param N number of cells
#' @param epsilon vector with cross contamination rate values for Labeled and
#' Total fraction
#' @param sigma vector with sequencing depth values in Labeled and Total
#' fraction
#' @param disp vector with gene-specific disperion estimates (alpha) for
#' Labeled and Total sample to be used in Negative Binomial
#' @param alpha.initial initial alpha value
#' @param beta.initial initial beta value
#' @param maxit maximum number of iterations for the optim function
#' @param trace logical; should all steps be reported
#' @param logloglik should log-loglikelihood be used? (might be more stable
#' against outliers)
#' @param debug should debugging modus be used?
#' @return output of optim (a list)
#'
#' @author Carina Demel
fitAlphaBeta.log.reparam <- function(
counts,
labeled.samples,
total.samples,
L,
N,
epsilon,
sigma,
disp,
alpha.initial,
beta.initial,
maxit = 10000,
trace = FALSE,
logloglik = TRUE,
debug = FALSE
){
if (debug)
browser()
#due to reparametrization adjust initial guess values
alpha.initial <- log(alpha.initial)
beta.initial <- log(beta.initial)
cost <- function(
theta,
counts,
L,
N,
epsilon,
sigma,
disp,
logloglik,
labeled.samples,
total.samples
){
l <- theta[1]
u <- theta[2]
labeled.disp <- disp[1]
total.disp <- disp[2]
#reparametrization
labeled.amount <- exp(l) # labeled amount alpha
unlabeled.amount <- exp(u) # unlabeled amount beta
exp.counts.L <- getExpectedCounts(L, labeled.amount, unlabeled.amount, N,
epsilon[labeled.samples], sigma[labeled.samples])
exp.counts.T <- getExpectedCounts(L, labeled.amount, unlabeled.amount, N,
epsilon[total.samples], sigma[total.samples])
neg.LL <- -sum(dnbinom(x = counts[labeled.samples], size = labeled.disp,
mu = exp.counts.L, log = TRUE)) -
sum(dnbinom(x = counts[total.samples], size = total.disp,
mu = exp.counts.T, log = TRUE))
if (logloglik) {
res <- log(neg.LL)
} else {
res <- neg.LL
}
return(res)
}
grad = function(
theta,
counts,
L,
N,
epsilon,
sigma,
disp,
logloglik,
labeled.samples,
total.samples
){
l <- theta[1]
u <- theta[2]
samples <- c(labeled.samples, total.samples)
#reparametrization
labeled.amount <- exp(l) # labeled amount alpha
unlabeled.amount <- exp(u) # unlabeled amount beta
exp.counts <- N * L * sigma[samples] *
(labeled.amount + epsilon[samples] * unlabeled.amount)
if (length(samples) > 2) { # replicates
disp <- c(rep(disp[1], length(labeled.samples)),
rep(disp[2], length(total.samples)))
}
# vectorial solution
d_a <- - sum((counts[samples] - exp.counts) * N * L * sigma[samples] *
labeled.amount/(exp.counts * (1 + exp.counts / disp)))
d_b <- - sum((counts[samples] - exp.counts) * N * L * sigma[samples] *
(epsilon[samples] * unlabeled.amount) /
(exp.counts * (1 + exp.counts / disp)) )
if (logloglik) {
neg.LL <- -sum(dnbinom(x = counts[samples], size = disp,
mu = exp.counts, log = TRUE))
res <- c(d_a, d_b) / neg.LL
} else {
res <- c(d_a, d_b)
}
return(res)
}
#minimize negative log likelihood == maximize likelihood
fit <- optim(c(alpha.initial, beta.initial), cost, gr = grad, counts = counts,
L = L, N = N, epsilon = epsilon, sigma = sigma, disp = disp,
control = list(maxit = maxit, trace = trace), method="BFGS",
logloglik = logloglik, labeled.samples = labeled.samples,
total.samples = total.samples)
return(fit)
}
#' Wrapper function to estimate gene- and time specific concentrations for
#' labeled (alpha) and unlabeled (beta) RNA for whole count table
#' The initial values for alpha and beta can be provided, but are otherwise
#' estimated from the data. In case the estimation yields negative or too small
#' values they are replaced by alphabeta.min.
#' For conditions, where no counts are available in both Labeled and Total
#' samples, no rate estimation takes place and NA values are returned.
#'
#' @param gene.indices indices of genes that should be used for calculation
#' (in order to run code on smaller subset)
#' @param counts count table, counts per gene (rows) and sample (columns)
#' @param dispersion matrix or dataframe with gene-specific disperion estimates
#' (alpha) for labeled and total sample to be used in Negative Binomial.
#' Rownames should correspond to rownames in count matrix
#' @param lengths gene lengths, should be in same order as rownames of count
#' table
#' @param conditions vector of time points/treatment of experiment for each
#' sample
#' @param conditions.labeling vector indicating for each sample if it was
#' labeled RNA or total RNA
#' @param replicates replicate number of each sample
#' @param cross.cont vector with cross contamination rate values for each
#' sample (total RNAseq cross.cont=1)
#' @param seq.depths vector with sequencing depth values for each sample
#' @param lab.time vector of same length as unique conditions, indicating
#' labeling duration
#' @param N number of cells
#' @param consider.replicates should - if available - be replicates taken into
#' account or not? If not, rep replicates is only considered for estimation
#' @param rep if no replicate is available this is replicate that should be
#' chosen
#' @param logloglik should log-loglikelihood function be used?
#' @param alpha.initial initial value for alpha, can be numeric (same for all
#' genes and timepoints), a vector (same for all timepoints of one gene), or a
#' matrix (individual for each gene and time point)
#' @param beta.initial initial value for beta, can be numeric (same for all
#' genes and timepoints), a vector (same for all timepoints of one gene), or a
#' matrix (individual for each gene and time point)
#' @param alphabeta.min minimum value as initial value for alpha and beta rates
#' @param debug should debugging modus be used?
#' @return estimates for alpha and beta, expected counts in Labeled and Total
#' and loss for each gene and each labeling time point
#'
#' @examples
#' data(gene.counts)
#' lengths = rnbinom(nrow(gene.counts), mu=1000, size=10)
#' data(spikein.counts)
#' data(spikein.labeling)
#' data(spikein.lengths)
#' spikeins = rownames(spikein.counts)
#' data(samples)
#' norm.res = spikein.normalization(spikein.counts, spikeins, spikein.lengths,
#' spikein.labeling,
#' samples=colnames(spikein.counts), samples$conditions.labeling)
#' seq.depths = norm.res$sequencing.depth
#' cross.cont = norm.res$cross.contamination
#' dispersion = estimateGeneDispersion(gene.counts, samples$conditions.labeling,
#' samples$conditions)
#' fittingres = estimateAlphaBeta.genes(1:12, gene.counts, dispersion, lengths,
#' samples$conditions, samples$conditions.labeling, samples$replicates,
#' cross.cont, seq.depths, N=1, consider.replicates=TRUE, lab.time=c(5,5),
#' alpha.initial=1, beta.initial=1)
#'
#' @author Carina Demel
#' @export
estimateAlphaBeta.genes <- function(
gene.indices = NULL,
counts,
dispersion,
lengths,
conditions,
conditions.labeling,
replicates,
cross.cont,
seq.depths,
lab.time,
N = 1,
consider.replicates = TRUE,
rep = 1,
logloglik = TRUE,
alpha.initial = NULL,
beta.initial = NULL,
alphabeta.min = 1e-4,
debug = FALSE
){
if (debug)
browser()
if (is.null(gene.indices))
gene.indices <- 1:nrow(counts)
counts <- as.matrix(counts)
uniqueConditions <- unique(conditions)
replicates.available <- ifelse(table(paste(conditions.labeling,
conditions))[paste(conditions.labeling, conditions)] > 1,
TRUE, FALSE)
rep.vec <- ((replicates.available & replicates == rep) |
!replicates.available)
if (any( replicates.available == FALSE & replicates != rep ))
message(paste("For some of the conditions no replicates are available,",
"then your selection for the replicate number is not considered",
"in those cases."))
res <- sapply(gene.indices, function(i){
sapply(1:length(uniqueConditions), function(t){
# at timepoints where we have replicates for L, select Replicate "rep"
if (!consider.replicates) {
labeled.samples <- which(conditions == uniqueConditions[t] &
conditions.labeling == "L" & rep.vec)
total.samples <- which(conditions == uniqueConditions[t] &
conditions.labeling == "T" & rep.vec)
} else {
labeled.samples <- which(conditions == uniqueConditions[t] &
conditions.labeling == "L")
total.samples <- which(conditions == uniqueConditions[t] &
conditions.labeling == "T")
}
samples <- c(labeled.samples, total.samples)
nl <- length(labeled.samples)
nt <- length(total.samples)
if (i == gene.indices[1]) {
message(paste("The following labeled and total samples are used",
"for synthesis and degradation rates of condition:",
uniqueConditions[t]))
message(paste("Labeled samples: Columns:",
paste(labeled.samples,collapse=","), "Col.names:",
paste(colnames(counts)[labeled.samples],collapse=", ")))
message(paste("Total samples: Columns:",
paste(total.samples,collapse=","), "Col.names:",
paste(colnames(counts)[total.samples],collapse=", ")))
}
if (any(counts[i, samples] > 0) & all(!is.na(dispersion[i, ]))) {
if (is.null(alpha.initial)) {
alpha.initial.g <- 1 / (N * lengths[i]) *
1 / (nl*sum(cross.cont[total.samples]) -
nt * sum(cross.cont[labeled.samples])) *
(sum(cross.cont[total.samples]) *
sum(counts[i, labeled.samples] / seq.depths[labeled.samples]) -
sum(cross.cont[labeled.samples]) *
sum(counts[i, total.samples] / (seq.depths[total.samples])))
} else if (length(alpha.initial) == 1) {
alpha.initial.g <- alpha.initial
} else {
alpha.initial.g <- alpha.initial[i, t]
}
if (is.null(beta.initial)) {
beta.initial.g <- 1 / (N * lengths[i]) *
1 / (sum(cross.cont[total.samples]) / nt -
sum(cross.cont[labeled.samples]) / nl) *
( sum(counts[i, total.samples] /
(nt * seq.depths[total.samples])) -
sum(counts[i, labeled.samples] /
(nl * seq.depths[labeled.samples])))
} else if (length(beta.initial) == 1) {
beta.initial.g <- beta.initial
} else {
beta.initial.g <- beta.initial[i, t]
}
# set alpha and beta to a minimum value, so that the estimation on log
# scalecan be performed. (should be non-negative alpha and beta values)
alpha.initial.g <- max(alpha.initial.g, alphabeta.min)
beta.initial.g <- max(beta.initial.g, alphabeta.min)
tryCatch({
fab = fitAlphaBeta.log.reparam(counts[i, ], labeled.samples,
total.samples, lengths[i], N, epsilon = cross.cont,
sigma = seq.depths,
disp=dispersion[i, ], alpha.initial = alpha.initial.g,
beta.initial = beta.initial.g)
return(c(exp(fab$par), fab$value, alpha.initial.g, beta.initial.g))
}, error=function(e) {
return(c(NA, NA, Inf, alpha.initial.g, beta.initial.g))
})
}else{
return(c(NA, NA, Inf, NA, NA))
}
})
})
alpha <- t(res[seq(1, nrow(res), 5), , drop=FALSE])
beta <- t(res[seq(1, nrow(res), 5) + 1, , drop=FALSE])
loss <- t(res[seq(1, nrow(res), 5) + 2, , drop=FALSE])
alpha.initial <- t(res[seq(1, nrow(res), 5) + 3, , drop=FALSE])
beta.initial <- t(res[seq(1, nrow(res), 5) + 4, , drop=FALSE])
if (consider.replicates) {
#take all available replicates into account
labeled.samples <- which(conditions.labeling == "L")
total.samples <- which(conditions.labeling == "T")
} else {
labeled.samples <- which(conditions.labeling == "L" & rep.vec)
total.samples <- which(conditions.labeling == "T" & rep.vec)
}
match.L <- match(conditions[labeled.samples], uniqueConditions)
match.T <- match(conditions[total.samples], uniqueConditions)
exp.counts.L <- getExpectedCounts(lengths[gene.indices], alpha[, match.L],
beta[, match.L], N, cross.cont[labeled.samples],
seq.depths[labeled.samples])
exp.counts.T <- getExpectedCounts(lengths[gene.indices], alpha[, match.T],
beta[, match.T], N, cross.cont[total.samples], seq.depths[total.samples])
lambda <- -1 / lab.time * log(beta / (alpha + beta))
mu <- (alpha + beta) * lambda
colnames(alpha) = colnames(alpha.initial) = colnames(beta) =
colnames(beta.initial) = colnames(loss) = colnames(lambda) =
colnames(mu) = uniqueConditions
if (length(gene.indices) == 1) {
exp.counts.T <- t(exp.counts.T)
exp.counts.L <- t(exp.counts.L)
}
rownames(alpha) = rownames(alpha.initial) = rownames(beta) =
rownames(beta.initial) = rownames(loss) = rownames(exp.counts.L) =
rownames(exp.counts.T) = rownames(lambda) = rownames(mu) =
rownames(counts)[gene.indices]
colnames(exp.counts.L) <- conditions[labeled.samples]
colnames(exp.counts.T) <- conditions[total.samples]
hl <- log(2)/lambda
res.list <- list(
"alpha" = alpha,
"beta" = beta,
"loss" = loss,
"exp.counts.L" = exp.counts.L,
"exp.counts.T" = exp.counts.T,
"mu" = mu,
"lambda" = lambda,
"half.life" = hl,
"alpha.initial" = alpha.initial,
"beta.initial" = beta.initial
)
return(res.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.