# Estimate sample specific parameters such as cross contamination rate and
# sequencing depth based on the counts of artificial spike-ins that were
# added to Labeled and Total RNA-Seq experiments
#
# Author: demel
###############################################################################
#' function to estimate normalization parameters based on spike-in counts
#' with GLM using offset length, seq depth per sample and cross-contamination
#' control AND a spike-specific parameter to control for e.g.sequence bias
#'
#' @param counts matrix with counts for each spike (row) and each sample
#' (column)
#' @param spikeins names of spike-ins, in same order as rownames of counts
#' @param spikein.lengths length of spike-ins
#' @param spikein.labeling labeling status of spike-ins (vector consisting of
#' "L" and "U")
#' @param samples individual name for each sample, e.g. colnames of count table
#' @param conditions.labeling labeling status of sample (vector consisting of
#' "L" and "T")
#' @param paired is there for every labeled sample also a total sample and vice
#' versa? Only then sequencing depths can be estimated. Else they will all be
#' set to 1.
#' @param debug should debugging modus be used?
#' @return list consisting of fitting-results in a data.frame and vector of
#' fitted.counts
#'
#' @examples
#' data(spikein.counts)
#' spikeins = rownames(spikein.counts)
#' data(spikein.labeling)
#' data(spikein.lengths)
#' data(samples)
#' norm.res = spikein.normalization(spikein.counts, spikeins, spikein.lengths,
#' spikein.labeling, samples=colnames(spikein.counts),
#' samples$conditions.labeling)
#' norm.res$sequencing.depth
#' norm.res$cross.contamination
#'
#' @importFrom MASS glm.nb
#' @author Carina Demel
#' @export
spikein.normalization <- function(
counts,
spikeins=rownames(counts),
spikein.lengths,
spikein.labeling,
samples=colnames(counts),
conditions.labeling,
paired=TRUE,
debug=FALSE
){
if (debug) browser()
if (!is.null(names(conditions.labeling)) &
all(names(conditions.labeling) %in% samples))
conditions.labeling <- conditions.labeling[samples]
if (all(names(spikein.lengths) %in% spikeins))
spikein.lengths <- spikein.lengths[spikeins]
if (all(names(spikein.labeling) %in% spikeins))
spikein.labeling <- spikein.labeling[spikeins]
counts <- as.matrix(counts)
### new in v0.6
if (all(rownames(counts) %in% spikeins))
counts <- counts[spikeins, ]
if (all(colnames(counts) %in% samples))
counts <- counts[, samples]
###
mat <- spikein.dataframe(counts = counts, spikeins = spikeins,
spikein.lengths = spikein.lengths,
spikein.labeling = spikein.labeling,
samples = samples, conditions.labeling = conditions.labeling)
if (paired==TRUE) {
nb_model <- glm.nb(counts ~ offset(log.length) + sample + ccc + spike,
data=mat, link="log")
fitted.counts <- nb_model$fitted.values
mat$fitted.counts <- fitted.counts
coefs <- coefficients(nb_model)
intercept <- coefs[1]
seq.depths <- exp(coefs[grep("sample", names(coefs))])
names(seq.depths) <- sub("sample", "", names(seq.depths))
seq.depths <- c(1, seq.depths) #reference sample is 0, exp(0)=1
names(seq.depths)[1] <- setdiff(samples, names(seq.depths))
seq.depths <- seq.depths[samples]
cross.cont.L <- exp(coefs[grep("cccL", names(coefs))])
cross.cont <- rep(0, length(samples))
names(cross.cont) <- samples
cross.cont[conditions.labeling == "L"] <- cross.cont.L
cross.cont[conditions.labeling == "T"] <- 1
cross.cont <- cross.cont[samples]
spike.specific.bias <- c(1,exp(coefs[grep("spike",names(coefs))]))
names(spike.specific.bias) <- sort(spikeins)
} else {
#not always total and labeled data per sample,
#then don't estimate sequencing depth
cross.cont <- colSums(counts[spikein.labeling == "U", ]) /
colSums(counts[spikein.labeling == "L", ])
cs <- colSums(counts)
seq.depths <- cs/(cs[1])
names(cross.cont) <- samples
names(seq.depths) <- samples
intercept <- NA
spike.specific.bias <- rep(NA, length(spikeins))
names(spike.specific.bias) <- sort(spikeins)
fitted.counts <- rep(NA, nrow(mat))
}
results <- data.frame(param = c(
"intercept",
rep("seq.depth", length(seq.depths)),
rep("cross.cont", length(cross.cont)),
rep("spike.specific.bias", length(spike.specific.bias))),
sample = c(
"all",
names(seq.depths),
names(cross.cont),
names(spike.specific.bias)),
values = c(intercept, seq.depths, cross.cont, spike.specific.bias))
res.list <- list(
"results" = results,
"intercept" = intercept,
"sequencing.depth" = seq.depths,
"cross.contamination" = cross.cont,
"spike.specific.bias" = spike.specific.bias,
"fitted.counts" = fitted.counts
)
return(res.list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.