R/estimateDispersion.R

# Estimate Disperion for all Labeled and Total samples
# 
# Author: demel
###############################################################################


#' Estimates gene-specific dispersion estimates for total and labeled samples,
#' using DESeq2
#' The DESeq estimation does only work correctly, if at least two samples are
#' available for each class(labeled/total),
#' either replicates or different conditions.
#' 
#' @param counts count table for genes (rows) and samples (columns)
#' @param conditions.labeling vector indicating for each sample if it was 
#' labeled ("L") or total ("T")
#' @param conditions vector giving the experimental condition for each sample
#' @param type type of disperion from deseq, "dispersion"=="dispMAP"
#' @return data.frame with dispersion estimates for every gene (rows) and 
#' labeled and total samples(columns)
#' 
#' @importFrom DESeq2 DESeqDataSetFromMatrix
#' @importFrom DESeq2 DESeq
#' 
#' @examples 
#' data(gene.counts)
#' data(samples)
#' dispersion = estimateGeneDispersion(gene.counts, samples$conditions.labeling,
#' samples$conditions)
#' 
#' @author Carina Demel
#' @export
estimateGeneDispersion <- function(
		counts,
		conditions.labeling,
		conditions = colnames(counts),
		type = c("dispersion", "dispGeneEst", "dispFit", "dispMAP")
){

		expDesign.allgene <- data.frame( row.names = colnames(counts),
				labeling = factor(conditions.labeling), 
				condition = factor(conditions))
		
		expDesign.total <- subset(expDesign.allgene, labeling == "T")
		message(paste(
			"For the Total gene dispersion, the following samples are used:", 
			paste(rownames(expDesign.total), collapse = ", ")))
		
		deseq.matrix.total <- DESeqDataSetFromMatrix(
				counts[, conditions.labeling == "T"],
				colData = expDesign.total,
				design = formula(~ condition))
		dds.total <- suppressMessages(DESeq(deseq.matrix.total))
		
		expDesign.labeled <- subset(expDesign.allgene, labeling == "L")
		message(paste(
			"For the Labeled gene dispersion, the following samples are used:",
			paste(rownames(expDesign.labeled), collapse = ", ")))
		
		deseq.matrix.labeled <- DESeqDataSetFromMatrix(
				counts[,conditions.labeling == "L"],
				colData = expDesign.labeled,
				design = formula(~ condition))
		dds.labeled <- suppressMessages(DESeq(deseq.matrix.labeled))
	
	suppressWarnings(
		if (type == "dispGeneEst") {
			dispersion.labeled <- mcols(dds.labeled)$dispGeneEst
			dispersion.total <- mcols(dds.total)$dispGeneEst
		} else if ( type == "dispFit" ) {
			dispersion.labeled <- mcols(dds.labeled)$dispFit
			dispersion.total <- mcols(dds.total)$dispFit
		} else {
			dispersion.labeled <- mcols(dds.labeled)$dispersion
			dispersion.total <- mcols(dds.total)$dispersion
		}
	)
	# in cases where no dispersion could be fitted because of missing values,
	# set dispersion to median gene dispersion
	if (sum(is.na(dispersion.labeled)) > 0) {
		message(paste("Some feature-specific dispersions could not be",
						"estimated (because of 0 counts) and were set to",
						"the median dispersion value of all features."))
	}
	dispersion.labeled[is.na(dispersion.labeled)] <- 
			median(dispersion.labeled, na.rm=TRUE)
	dispersion.total[is.na(dispersion.total)] <- 
			median(dispersion.total, na.rm=TRUE)
	
	dispersion <- cbind("L" = dispersion.labeled, "T" = dispersion.total)
	rownames(dispersion) <- rownames(dds.labeled)
	return(dispersion)
	
}
carinademel/RNAlife documentation built on May 13, 2019, 12:43 p.m.