R/fitRates.R

# Function and helper function to estimate synthesis (mu) and degradation 
# (lambda) rates based on labeled (alpha) and unlabeled (beta) amounts of RNA
# 
# Author: demel
###############################################################################


#' Estimate best alpha = labeled amount and best beta = unlabeled amount of one
#' gene (either for one or multiple samples).
#' Called in estimateAlphaBeta.
#'  
#' @param counts observed counts for a gene (in labeled and total fraction)
#' @param labeled.samples index of labeled sample(s) for which alpha and beta
#' should be estimated
#' @param total.samples index of total sample(s) for which alpha and beta
#' should be estimated
#' @param L length of the gene
#' @param N number of cells
#' @param epsilon vector with cross contamination rate values for Labeled and
#' Total fraction
#' @param sigma vector with sequencing depth values in Labeled and Total
#' fraction
#' @param disp vector with gene-specific disperion estimates (alpha) for
#' Labeled and Total sample to be used in Negative Binomial
#' @param alpha.initial initial alpha value
#' @param beta.initial initial beta value
#' @param maxit maximum number of iterations for the optim function
#' @param trace logical; should all steps be reported
#' @param logloglik should log-loglikelihood be used? (might be more stable
#' against outliers)
#' @param debug should debugging modus be used?
#' @return output of optim (a list)
#' 
#' @author Carina Demel
fitAlphaBeta.log.reparam <- function(
		counts,
		labeled.samples,
		total.samples,
		L,
		N,
		epsilon,
		sigma,
		disp,
		alpha.initial,
		beta.initial,
		maxit = 10000,
		trace = FALSE,
		logloglik = TRUE,
		debug = FALSE
){
	
	if (debug)
		browser()
	
	#due to reparametrization adjust initial guess values
	alpha.initial <- log(alpha.initial)
	beta.initial <- log(beta.initial)
	
	cost <- function(
			theta,
			counts,
			L,
			N,
			epsilon,
			sigma,
			disp,
			logloglik,
			labeled.samples,
			total.samples
	){
		l <- theta[1]
		u <- theta[2]
		labeled.disp <- disp[1]
		total.disp <- disp[2]
		
		#reparametrization
		labeled.amount <- exp(l) # labeled amount alpha
		unlabeled.amount <- exp(u) # unlabeled amount beta
		
		exp.counts.L <- getExpectedCounts(L, labeled.amount, unlabeled.amount, N,
				epsilon[labeled.samples], sigma[labeled.samples])
		exp.counts.T <- getExpectedCounts(L, labeled.amount, unlabeled.amount, N,
				epsilon[total.samples], sigma[total.samples])

		neg.LL <- -sum(dnbinom(x = counts[labeled.samples], size = labeled.disp,
										mu = exp.counts.L, log = TRUE)) -
						sum(dnbinom(x = counts[total.samples], size = total.disp,
										mu = exp.counts.T, log = TRUE)) 
		
		if (logloglik) {
			res <- log(neg.LL)
		} else {
			res <- neg.LL
		}
		return(res)
	}
	
	grad = function(
			theta,
			counts,
			L,
			N,
			epsilon,
			sigma,
			disp,
			logloglik,
			labeled.samples,
			total.samples
	){
		l <- theta[1]
		u <- theta[2]
		samples <- c(labeled.samples, total.samples)
	
		#reparametrization
		labeled.amount <- exp(l) # labeled amount alpha
		unlabeled.amount <- exp(u) # unlabeled amount beta
		
		exp.counts <- N * L * sigma[samples] *
				(labeled.amount + epsilon[samples] * unlabeled.amount)

		if (length(samples) > 2) { # replicates
			disp <- c(rep(disp[1], length(labeled.samples)), 
					rep(disp[2], length(total.samples)))
		}
		# vectorial solution
		d_a <- - sum((counts[samples] - exp.counts) * N * L * sigma[samples] * 
						labeled.amount/(exp.counts * (1 + exp.counts / disp)))
		d_b <- - sum((counts[samples] - exp.counts) * N * L * sigma[samples] *
						(epsilon[samples] * unlabeled.amount) / 
						(exp.counts * (1 + exp.counts / disp)) )

		if (logloglik) {
			neg.LL <- -sum(dnbinom(x = counts[samples], size = disp, 
							mu = exp.counts, log = TRUE))
			res <- c(d_a, d_b) / neg.LL
		} else {
			res <- c(d_a, d_b)
		}
		return(res)
	}
	
	#minimize negative log likelihood == maximize likelihood
	fit <- optim(c(alpha.initial, beta.initial), cost, gr = grad, counts = counts,
			L = L, N = N, epsilon = epsilon, sigma = sigma, disp = disp,
			control = list(maxit = maxit, trace = trace), method="BFGS",
			logloglik = logloglik, labeled.samples = labeled.samples,
			total.samples = total.samples)
	return(fit)
}



#' Wrapper function to estimate gene- and time specific concentrations for 
#' labeled (alpha) and unlabeled (beta) RNA for whole count table
#' The initial values for alpha and beta can be provided, but are otherwise 
#' estimated from the data. In case the estimation yields negative or too small
#' values they are replaced by alphabeta.min.
#' For conditions, where no counts are available in both Labeled and Total 
#' samples, no rate estimation takes place and NA values are returned.
#' 
#' @param gene.indices indices of genes that should be used for calculation 
#' (in order to run code on smaller subset)
#' @param counts count table, counts per gene (rows) and sample (columns)
#' @param dispersion matrix or dataframe with gene-specific disperion estimates
#'  (alpha) for labeled and total sample to be used in Negative Binomial. 
#' Rownames should correspond to rownames in count matrix
#' @param lengths gene lengths, should be in same order as rownames of count 
#' table
#' @param conditions vector of time points/treatment of experiment for each 
#' sample
#' @param conditions.labeling vector indicating for each sample if it was 
#' labeled RNA or total RNA
#' @param replicates replicate number of each sample
#' @param cross.cont vector with cross contamination rate values for each 
#' sample (total RNAseq cross.cont=1)
#' @param seq.depths vector with sequencing depth values for each sample
#' @param lab.time vector of same length as unique conditions, indicating 
#' labeling duration
#' @param N number of cells
#' @param consider.replicates should - if available - be replicates taken into 
#' account or not? If not, rep replicates is only considered for estimation
#' @param rep if no replicate is available this is replicate that should be 
#' chosen
#' @param logloglik should log-loglikelihood function be used?
#' @param alpha.initial initial value for alpha, can be numeric (same for all 
#' genes and timepoints), a vector (same for all timepoints of one gene), or a 
#' matrix (individual for each gene and time point)
#' @param beta.initial initial value for beta, can be numeric (same for all 
#' genes and timepoints), a vector (same for all timepoints of one gene), or a 
#' matrix (individual for each gene and time point)
#' @param alphabeta.min minimum value as initial value for alpha and beta rates
#' @param debug should debugging modus be used?
#' @return estimates for alpha and beta, expected counts in Labeled and Total 
#' and loss for each gene and each labeling time point
#' 
#' @examples 
#' data(gene.counts)
#' lengths = rnbinom(nrow(gene.counts), mu=1000, size=10)
#' data(spikein.counts)
#' data(spikein.labeling)
#' data(spikein.lengths)
#' spikeins = rownames(spikein.counts)
#' data(samples)
#' norm.res = spikein.normalization(spikein.counts, spikeins, spikein.lengths, 
#' spikein.labeling, 
#' 	samples=colnames(spikein.counts), samples$conditions.labeling)
#' seq.depths = norm.res$sequencing.depth
#' cross.cont = norm.res$cross.contamination
#' dispersion = estimateGeneDispersion(gene.counts, samples$conditions.labeling,
#'  samples$conditions)
#' fittingres = estimateAlphaBeta.genes(1:12, gene.counts, dispersion, lengths, 
#' 	samples$conditions, samples$conditions.labeling, samples$replicates, 
#'  cross.cont, seq.depths, N=1, consider.replicates=TRUE, lab.time=c(5,5),
#'  alpha.initial=1, beta.initial=1)
#' 
#' @author Carina Demel
#' @export
estimateAlphaBeta.genes <- function(
		gene.indices = NULL,
		counts,
		dispersion,
		lengths,
		conditions,
		conditions.labeling,
		replicates, 
		cross.cont,
		seq.depths,
		lab.time, 
		N = 1,
		consider.replicates = TRUE,
		rep = 1,
		logloglik = TRUE,
		alpha.initial = NULL,
		beta.initial = NULL,
		alphabeta.min = 1e-4,
		debug = FALSE
){

	if (debug)
		browser()
	if (is.null(gene.indices))
		gene.indices <- 1:nrow(counts)

	counts <- as.matrix(counts)
	uniqueConditions <- unique(conditions)
	replicates.available <- ifelse(table(paste(conditions.labeling, 
							conditions))[paste(conditions.labeling, conditions)] > 1,
			TRUE, FALSE)
	rep.vec <- ((replicates.available & replicates == rep) | 
				!replicates.available)
	if (any( replicates.available == FALSE & replicates != rep ))
		message(paste("For some of the conditions no replicates are available,",
		"then your selection for the replicate number is not considered",
		"in those cases."))
	


	res <- sapply(gene.indices, function(i){ 
		sapply(1:length(uniqueConditions), function(t){

			# at timepoints where we have replicates for L, select Replicate "rep"
			if (!consider.replicates) {
				labeled.samples <- which(conditions == uniqueConditions[t] &
								conditions.labeling == "L" & rep.vec)
				total.samples <- which(conditions == uniqueConditions[t] &
								conditions.labeling == "T" & rep.vec)
			} else {
				labeled.samples <- which(conditions == uniqueConditions[t] &
								conditions.labeling == "L")
				total.samples <- which(conditions == uniqueConditions[t] &
								conditions.labeling == "T")
			}
			samples <- c(labeled.samples, total.samples)
			nl <- length(labeled.samples)
			nt <- length(total.samples)
						
			if (i == gene.indices[1]) {
				message(paste("The following labeled and total samples are used",
								"for synthesis and degradation rates of condition:", 
								uniqueConditions[t]))
				message(paste("Labeled samples: Columns:",
								paste(labeled.samples,collapse=","), "Col.names:",
								paste(colnames(counts)[labeled.samples],collapse=", ")))
				message(paste("Total samples: Columns:",
								paste(total.samples,collapse=","), "Col.names:",
								paste(colnames(counts)[total.samples],collapse=", ")))
			}
			
			
			if (any(counts[i, samples] > 0) & all(!is.na(dispersion[i, ]))) {	
				if (is.null(alpha.initial)) {
					alpha.initial.g <- 1 / (N * lengths[i]) * 
							1 / (nl*sum(cross.cont[total.samples]) - 
								nt * sum(cross.cont[labeled.samples])) * 
							(sum(cross.cont[total.samples]) * 
								sum(counts[i, labeled.samples] / seq.depths[labeled.samples])	-
								sum(cross.cont[labeled.samples]) * 
								sum(counts[i, total.samples] / (seq.depths[total.samples])))
				} else if (length(alpha.initial) == 1) {
					alpha.initial.g <- alpha.initial
				} else {
					alpha.initial.g <- alpha.initial[i, t]
				}
							
				if (is.null(beta.initial)) {
					beta.initial.g <- 1 / (N * lengths[i]) * 
							1 / (sum(cross.cont[total.samples]) / nt - 
								sum(cross.cont[labeled.samples]) / nl) * 
							( sum(counts[i, total.samples] / 
												(nt * seq.depths[total.samples])) -
								sum(counts[i, labeled.samples] /
												(nl * seq.depths[labeled.samples])))
				} else if (length(beta.initial) == 1) {
					beta.initial.g <- beta.initial
				} else {
					beta.initial.g <- beta.initial[i, t]
				}
				# set alpha and beta to a minimum value, so that the estimation on log
				# scalecan be performed. (should be non-negative alpha and beta values)
				alpha.initial.g <- max(alpha.initial.g, alphabeta.min)
				beta.initial.g <- max(beta.initial.g, alphabeta.min)				
	
				tryCatch({
						fab = fitAlphaBeta.log.reparam(counts[i, ], labeled.samples, 
								total.samples, lengths[i], N, epsilon = cross.cont,
								sigma = seq.depths,
								disp=dispersion[i, ], alpha.initial = alpha.initial.g,
								beta.initial = beta.initial.g)
						return(c(exp(fab$par), fab$value, alpha.initial.g, beta.initial.g))
				}, error=function(e) {
					return(c(NA, NA, Inf, alpha.initial.g, beta.initial.g))
				})	
									
			}else{
				return(c(NA, NA, Inf, NA, NA))
			}
		})
	})

	alpha <- t(res[seq(1, nrow(res), 5), , drop=FALSE])
	beta <- t(res[seq(1, nrow(res), 5) + 1, , drop=FALSE])
	loss <- t(res[seq(1, nrow(res), 5) + 2, , drop=FALSE])
	alpha.initial <- t(res[seq(1, nrow(res), 5) + 3, , drop=FALSE])
	beta.initial <- t(res[seq(1, nrow(res), 5) + 4, , drop=FALSE])

	if (consider.replicates) {
		#take all available replicates into account
		labeled.samples <- which(conditions.labeling == "L")
		total.samples <- which(conditions.labeling == "T")
	} else {
		labeled.samples <- which(conditions.labeling == "L" & rep.vec)
		total.samples <- which(conditions.labeling == "T" & rep.vec) 
	}
	match.L <- match(conditions[labeled.samples], uniqueConditions)
	match.T <- match(conditions[total.samples], uniqueConditions)

	exp.counts.L <- getExpectedCounts(lengths[gene.indices], alpha[, match.L], 
			beta[, match.L], N, cross.cont[labeled.samples], 
			seq.depths[labeled.samples]) 	
	exp.counts.T <- getExpectedCounts(lengths[gene.indices], alpha[, match.T], 
			beta[, match.T], N, cross.cont[total.samples], seq.depths[total.samples])
	lambda <- -1 / lab.time * log(beta / (alpha + beta))
	mu <- (alpha + beta) * lambda
	
	colnames(alpha) = colnames(alpha.initial) = colnames(beta) = 
			colnames(beta.initial) = colnames(loss) = colnames(lambda) = 
			colnames(mu) = uniqueConditions
	if (length(gene.indices) == 1) {
		exp.counts.T <- t(exp.counts.T)
		exp.counts.L <- t(exp.counts.L)
	}
	rownames(alpha) = rownames(alpha.initial) = rownames(beta) = 
			rownames(beta.initial) = rownames(loss) = rownames(exp.counts.L) = 
			rownames(exp.counts.T) = rownames(lambda) = rownames(mu) = 
			rownames(counts)[gene.indices]
	
	colnames(exp.counts.L) <- conditions[labeled.samples]
	colnames(exp.counts.T) <- conditions[total.samples]

	hl <- log(2)/lambda
	res.list <-	list(
			"alpha" = alpha,
			"beta" = beta,
			"loss" = loss,
			"exp.counts.L" = exp.counts.L,
			"exp.counts.T" = exp.counts.T,
			"mu" = mu,
			"lambda" = lambda,
			"half.life" = hl,
			"alpha.initial" = alpha.initial,
			"beta.initial" = beta.initial
	)
	return(res.list)
}
carinademel/RNAlife documentation built on May 13, 2019, 12:43 p.m.