R/spikein.normalization.R

# Estimate sample specific parameters such as cross contamination rate and
# sequencing depth based on the counts of artificial spike-ins that were
# added to Labeled and Total RNA-Seq experiments
# 
# Author: demel
###############################################################################


#' function to estimate normalization parameters based on spike-in counts
#' with GLM using offset length, seq depth per sample and cross-contamination 
#' control AND a spike-specific parameter to control for e.g.sequence bias
#'  
#' @param counts matrix with counts for each spike (row) and each sample
#' (column)
#' @param spikeins names of spike-ins, in same order as rownames of counts
#' @param spikein.lengths length of spike-ins
#' @param spikein.labeling labeling status of spike-ins (vector consisting of
#' "L" and "U")
#' @param samples individual name for each sample, e.g. colnames of count table
#' @param conditions.labeling labeling status of sample (vector consisting of
#' "L" and "T")
#' @param paired is there for every labeled sample also a total sample and vice
#' versa? Only then sequencing depths can be estimated. Else they will all be
#' set to 1.
#' @param debug should debugging modus be used?
#' @return list consisting of fitting-results in a data.frame and vector of 
#' fitted.counts
#' 
#' @examples 
#' data(spikein.counts)
#' spikeins = rownames(spikein.counts)
#' data(spikein.labeling)
#' data(spikein.lengths)
#' data(samples)
#' norm.res = spikein.normalization(spikein.counts, spikeins, spikein.lengths,
#' 	spikein.labeling, samples=colnames(spikein.counts),
#'  samples$conditions.labeling)
#' norm.res$sequencing.depth
#' norm.res$cross.contamination
#' 
#' @importFrom MASS glm.nb
#' @author Carina Demel
#' @export
spikein.normalization <- function(
		counts,
		spikeins=rownames(counts),
		spikein.lengths,
		spikein.labeling,
		samples=colnames(counts),
		conditions.labeling,
		paired=TRUE,
		debug=FALSE
){
	
	if (debug) browser()
	if (!is.null(names(conditions.labeling)) & 
			all(names(conditions.labeling) %in% samples)) 
		conditions.labeling <- conditions.labeling[samples]
	if (all(names(spikein.lengths) %in% spikeins)) 
		spikein.lengths <- spikein.lengths[spikeins]
	if (all(names(spikein.labeling) %in% spikeins)) 
		spikein.labeling <- spikein.labeling[spikeins]
	counts <- as.matrix(counts)
	
	### new in v0.6
	if (all(rownames(counts) %in% spikeins)) 
		counts <- counts[spikeins, ]
	if (all(colnames(counts) %in% samples)) 
		counts <- counts[, samples]
	###
	
	mat <- spikein.dataframe(counts = counts, spikeins = spikeins, 
			spikein.lengths = spikein.lengths,
			spikein.labeling = spikein.labeling,
			samples = samples, conditions.labeling = conditions.labeling)

	if (paired==TRUE) {
		nb_model <- glm.nb(counts ~ offset(log.length) + sample + ccc + spike,
				data=mat, link="log")
		fitted.counts <- nb_model$fitted.values
		mat$fitted.counts <- fitted.counts
			
		coefs <- coefficients(nb_model)
		intercept <- coefs[1]
		
		seq.depths <- exp(coefs[grep("sample", names(coefs))]) 
		names(seq.depths) <- sub("sample", "", names(seq.depths))
		seq.depths <- c(1, seq.depths) #reference sample is 0, exp(0)=1
		names(seq.depths)[1] <- setdiff(samples, names(seq.depths))
		seq.depths <- seq.depths[samples]
		
		cross.cont.L <- exp(coefs[grep("cccL", names(coefs))])
		cross.cont <- rep(0, length(samples))
		names(cross.cont) <- samples
		cross.cont[conditions.labeling == "L"] <- cross.cont.L
		cross.cont[conditions.labeling == "T"] <- 1
		cross.cont <- cross.cont[samples]
		
		spike.specific.bias <- c(1,exp(coefs[grep("spike",names(coefs))]))
		names(spike.specific.bias) <- sort(spikeins)
		
	} else {
		#not always total and labeled data per sample,
		#then don't estimate sequencing depth
		cross.cont <- colSums(counts[spikein.labeling == "U", ]) / 
				colSums(counts[spikein.labeling == "L", ])
		cs <- colSums(counts)
		seq.depths <- cs/(cs[1])
		names(cross.cont) <- samples
		names(seq.depths) <- samples
		intercept <- NA
		spike.specific.bias <- rep(NA, length(spikeins))
		names(spike.specific.bias) <- sort(spikeins)
		fitted.counts <- rep(NA, nrow(mat))
	}
	results <- data.frame(param = c(
							"intercept", 
							rep("seq.depth", length(seq.depths)),
							rep("cross.cont", length(cross.cont)),
							rep("spike.specific.bias", length(spike.specific.bias))),
					sample = c(
							"all",
							names(seq.depths),
							names(cross.cont),
							names(spike.specific.bias)),
					values = c(intercept, seq.depths, cross.cont, spike.specific.bias))
	
	res.list <- list(
					"results" = results,
					"intercept" = intercept,
					"sequencing.depth" = seq.depths,
					"cross.contamination" = cross.cont,
					"spike.specific.bias" = spike.specific.bias,
					"fitted.counts" = fitted.counts
	)
	return(res.list)
}
carinademel/RNAlife documentation built on May 13, 2019, 12:43 p.m.