# R/spikein.dataframe.R

# Build a dataframe with all information about spike-in counts per sample
# 
# Author: demel
###############################################################################



#' Build the long-format spike-in data frame used for the GLM
#' 
#' Reshapes a spike-in count matrix into one row per spike-in/sample
#' combination and annotates every row with the spike-in length, the labeling
#' status of the spike-in and of the sample, and the cross-contamination
#' indicators.
#' 
#' @param counts matrix with counts for each spike (row) and each sample
#' (column)
#' @param spikeins names of spike-ins, in same order as rownames of counts
#' @param spikein.lengths length of spike-ins; it is subset with
#' \code{spikeins}, so it is expected to be named by spike-in when
#' \code{spikeins} holds names
#' @param spikein.labeling labeling status of spike-ins (vector consisting of
#' "L" and "U"); subset with \code{spikeins} in the same way as
#' \code{spikein.lengths}
#' @param samples individual name for each sample, e.g. colnames of count table
#' @param conditions.labeling labeling status of sample (vector consisting of
#' "L" and "T"), in same order as \code{samples}
#' @param debug should debugging mode be used? If \code{TRUE},
#' \code{browser()} is entered at the start of the function.
#' @return data frame with one row per spike-in and sample (spike-ins varying
#' fastest within each sample) and the columns \code{spike}, \code{length},
#' \code{spike.labeled}, \code{sample}, \code{sample.labeling} (factor),
#' \code{counts}, \code{control.for.cross.contamination} (factor; "T" for
#' unlabeled spike-ins in labeled samples, "F" otherwise), \code{ccc} (factor;
#' "L <sample index>" for the cross-contamination control rows, "F" for all
#' other rows) and \code{log.length} (natural logarithm of \code{length}).
#' 
#' @author Carina Demel
#' @export 
spikein.dataframe <- function(
		counts,
		spikeins = rownames(counts),
		spikein.lengths,
		spikein.labeling,
		samples = colnames(counts),
		conditions.labeling,
		debug = FALSE
){
	
	if (debug) {
		browser()
	}
	
	n.spikes <- length(spikeins)
	n.samples <- length(samples)
	
	# as.vector() unrolls the matrix column by column, i.e. all spike-ins of
	# the first sample, then all spike-ins of the second sample, and so on.
	# The annotation columns below are repeated in exactly that order.
	counts.vec <- as.vector(counts)
	mat <- data.frame(
			spike = rep(spikeins, n.samples),
			length = rep(spikein.lengths[spikeins], n.samples),
			spike.labeled = rep(spikein.labeling[spikeins], n.samples),
			sample = rep(paste(samples), each = n.spikes),
			sample.labeling = rep(conditions.labeling, each = n.spikes),
			counts = counts.vec)
	
	# control for cross-contamination: unlabeled spike-ins that are
	# nevertheless observed in a labeled sample
	is.control <- mat$sample.labeling == "L" & mat$spike.labeled == "U"
	mat$control.for.cross.contamination <- factor(ifelse(is.control, "T", "F"))
	mat$sample.labeling <- factor(mat$sample.labeling)
	
	# ccc: one level per sample ("L <sample index>") for the control rows and
	# a single shared level "F" for all remaining rows.
	# The label is built with paste()'s default separator only. A misspelled
	# `collape = " "` argument used to be passed here; paste() treated it as
	# an additional string and appended trailing whitespace to every label,
	# while a correctly spelled `collapse` would have merged all labels into
	# one single string.
	sample.index <- rep(seq_along(samples), each = n.spikes)
	mat$ccc <- paste("L", sample.index)
	mat$ccc[mat$control.for.cross.contamination == "F"] <- "F"
	mat$ccc <- factor(mat$ccc)
	
	mat$log.length <- log(mat$length) # natural logarithm
	
	mat
}
# carinademel/RNAlife documentation built on May 13, 2019, 12:43 p.m.