Nothing
#' @rdname DESeqDataSet
#' @export
setClass("DESeqDataSet",
contains = "RangedSummarizedExperiment",
representation = representation(
design = "ANY",
dispersionFunction = "function"))
setValidity("DESeqDataSet", function(object) {
if (! ("counts" %in% assayNames(object)) )
return( "the assays slot must contain a matrix named 'counts'" )
if ( !is.numeric( counts(object) ) )
return( "the count data is not numeric" )
if ( any( is.na( counts(object) ) ) )
return( "NA values are not allowed in the count matrix" )
if ( !is.integer( counts(object) ) )
return( "the count data is not in integer mode" )
if ( any( counts(object) < 0 ) )
return( "the count data contains negative values" )
design <- design(object)
# 'design' is either a formula or matrix
stopifnot(is(design, "formula") | is(design, "matrix"))
if (is(design, "formula")) {
designVars <- all.vars(design)
if (!all(designVars %in% names(colData(object)))) {
return("all variables in design formula must be columns in colData")
}
designVarsClass <- sapply(designVars, function(v) class(colData(object)[[v]]))
if (any(designVarsClass == "character")) {
return("variables in design formula are character vectors.
convert these columns of colData(object) to factors before including in the design formula")
}
designFactors <- designVars[designVarsClass == "factor"]
# levels would duplicate after make.names()
if (any(sapply(designFactors,function(v) {
factor.lvls <- levels(colData(object)[[v]])
factor.nms <- make.names(factor.lvls)
any(duplicated(factor.nms))
}))) {
return("levels of factors in the design have non-unique level names after make.names() is applied.
best to only use letters and numbers for levels of factors in the design")
}
# levels contain characters other than letters, numbers, and underscore
if (any(sapply(designFactors,function(v) {
factor.lvls <- levels(colData(object)[[v]])
any(!grepl("^[A-Za-z0-9_.]+$",factor.lvls))
}))) {
# just a warning for now
message(" Note: levels of factors in the design contain characters other than
letters, numbers, '_' and '.'. It is recommended (but not required) to use
only letters, numbers, and delimiters '_' or '.', as these are safe characters
for column names in R. [This is a message, not a warning or an error]")
}
} else if (is(design, "matrix")) {
# TODO add some more tests for if 'design' is matrix
stopifnot(nrow(design) == ncol(object))
}
TRUE
})
#' DESeqDataSet object and constructors
#'
#' \code{DESeqDataSet} is a subclass of \code{RangedSummarizedExperiment},
#' used to store the input values, intermediate calculations and results of an
#' analysis of differential expression. The \code{DESeqDataSet} class
#' enforces non-negative integer values in the "counts" matrix stored as
#' the first element in the assay list.
#' In addition, a formula which specifies the design of the experiment must be provided.
#' The constructor functions create a DESeqDataSet object
#' from various types of input:
#' a RangedSummarizedExperiment, a matrix, count files generated by
#' the python package HTSeq, or a list from the tximport function in the
#' tximport package.
#' See the vignette for examples of construction from different types.
#'
#' Note on the error message "assay colnames() must be NULL or equal colData rownames()":
#' this means that the colnames of countData are different than the rownames of colData.
#' Fix this with: \code{colnames(countData) <- NULL}
#'
#' @param se a \code{RangedSummarizedExperiment} with columns of variables
#' indicating sample information in \code{colData},
#' and the counts as the first element in the assays list, which will
#' be renamed "counts". A \code{RangedSummarizedExperiment} object can be
#' generated by the function \code{summarizeOverlaps} in the GenomicAlignments
#' package.
#' @param design a \code{formula} or \code{matrix}.
#' the \code{formula} expresses how the counts for each gene
#' depend on the variables in \code{colData}. Many R \code{formula} are valid,
#' including designs with multiple variables, e.g., \code{~ group + condition},
#' and designs with interactions, e.g., \code{~ genotype + treatment + genotype:treatment}.
#' See \code{\link{results}} for a variety of designs and how to extract results tables.
#' By default, the functions in this package will use
#' the last variable in the formula for building results tables and plotting.
#' \code{~ 1} can be used for no design, although users need to remember
#' to switch to another design for differential testing.
#' @param countData for matrix input: a matrix of non-negative integers
#' @param colData for matrix input: a \code{DataFrame} or \code{data.frame} with at least a single column.
#' Rows of colData correspond to columns of countData
#' @param tidy for matrix input: whether the first column of countData is the rownames for the count matrix
#' @param sampleTable for htseq-count: a \code{data.frame} with three or more columns. Each row
#' describes one sample. The first column is the sample name, the second column
#' the file name of the count file generated by htseq-count, and the remaining
#' columns are sample metadata which will be stored in \code{colData}
#' @param txi for tximport: the simple list output of the \code{tximport} function
#' @param directory for htseq-count: the directory relative to which the filenames are specified. defaults to current directory
#' @param ignoreRank use of this argument is reserved for DEXSeq developers only.
#' Users will immediately encounter an error upon trying to estimate dispersion
#' using a design with a model matrix which is not full rank.
#' @param ... arguments provided to \code{SummarizedExperiment} including rowRanges and metadata. Note that
#' for Bioconductor 3.1, rowRanges must be a GRanges or GRangesList, with potential metadata columns
#' as a DataFrame accessed and stored with \code{mcols}. If a user wants to store metadata columns
#' about the rows of the countData, but does not have GRanges or GRangesList information,
#' first construct the DESeqDataSet without rowRanges and then add the DataFrame with \code{mcols(dds)}.
#'
#' @return A DESeqDataSet object.
#'
#' @aliases DESeqDataSet DESeqDataSet-class DESeqDataSetFromMatrix DESeqDataSetFromHTSeqCount
#'
#' @references See \url{http://www-huber.embl.de/users/anders/HTSeq} for htseq-count
#'
#' @docType class
#'
#' @examples
#'
#' countData <- matrix(1:100,ncol=4)
#' condition <- factor(c("A","A","B","B"))
#' dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), ~ condition)
#'
#' @rdname DESeqDataSet
#' @importFrom utils packageVersion
#' @export
DESeqDataSet <- function(se, design, ignoreRank=FALSE) {
if (!is(se, "RangedSummarizedExperiment")) {
if (is(se, "SummarizedExperiment")) {
se <- as(se, "RangedSummarizedExperiment")
} else {
stop("'se' must be a RangedSummarizedExperiment object")
}
}
if (is.null(assayNames(se)) || assayNames(se)[1] != "counts") {
message("renaming the first element in assays to 'counts'")
assayNames(se)[1] <- "counts"
}
# special tximeta processing
if ("tximetaInfo" %in% names(metadata(se))) {
se <- processTximeta(se)
}
# before validity check, try to convert assay to integer mode
if (any(is.na(assay(se))))
stop("NA values are not allowed in the count matrix")
if (any(assay(se) < 0)) {
stop("some values in assay are negative")
}
if (!is.integer(assay(se))) {
if (!is.numeric(assay(se))) {
stop(paste("counts matrix should be numeric, currently it has mode:", mode(assay(se))))
}
if (any(round(assay(se)) != assay(se))) {
stop("some values in assay are not integers")
}
message("converting counts to integer mode")
mode(assay(se)) <- "integer"
}
if (all(assay(se) == 0)) {
stop("all samples have 0 counts for all genes. check the counting script.")
}
if (all(rowSums(assay(se) == assay(se)[,1]) == ncol(se))) {
warning("all genes have equal values for all samples. will not be able to perform differential analysis")
}
if (any(duplicated(rownames(se)))) {
warning(sum(duplicated(rownames(se)))," duplicate rownames were renamed by adding numbers")
rnms <- rownames(se)
dups <- unique(rnms[duplicated(rnms)])
for (rn in dups) {
idx <- which(rnms == rn)
rnms[idx[-1]] <- paste(rnms[idx[-1]], c(seq_len(length(idx) - 1)), sep=".")
}
rownames(se) <- rnms
}
if (is(design, "formula")) {
designVars <- all.vars(design)
if (!all(designVars %in% names(colData(se)))) {
stop("all variables in design formula must be columns in colData")
}
for (v in designVars) {
if (any(is.na(colData(se)[[v]])))
stop(paste("variables in design formula cannot contain NA:", v))
}
designVarsClass <- sapply(designVars, function(v) class(colData(se)[[v]])[1])
if (any(designVarsClass == "character")) {
warning("some variables in design formula are characters, converting to factors")
for (v in designVars[designVarsClass == "character"]) {
colData(se)[[v]] <- factor(colData(se)[[v]])
}
}
if (length(designVars) == 1) {
var <- colData(se)[[designVars]]
if (all(var == var[1])) {
stop("design has a single variable, with all samples having the same value.
use instead a design of '~ 1'. estimateSizeFactors, rlog and the VST can then be used")
}
}
designVarsNumeric <- sapply(designVars, function(v) is.numeric(colData(se)[[v]]))
if (any(designVarsNumeric)) {
msgIntVars <- FALSE
msgCenterScale <- FALSE
for (v in designVars[designVarsNumeric]) {
if (all(colData(se)[[v]] == round(colData(se)[[v]]))) {
msgIntVars <- TRUE
}
if (mean(colData(se)[[v]]) > 5 | sd(colData(se)[[v]]) > 5) {
msgCenterScale <- TRUE
}
}
if (msgIntVars) {
message(" the design formula contains one or more numeric variables with integer values,
specifying a model with increasing fold change for higher values.
did you mean for this to be a factor? if so, first convert
this variable to a factor using the factor() function")
}
if (msgCenterScale) {
message(" the design formula contains one or more numeric variables that have mean or
standard deviation larger than 5 (an arbitrary threshold to trigger this message).
it is generally a good idea to center and scale numeric variables in the design
to improve GLM convergence.")
}
}
if (any(designVarsClass == "ordered")) {
stop("the design formula contains an ordered factor. The internal steps
do not work on ordered factors as a formula. Instead you should provide a matrix to
the 'design' slot or to the 'full' argument of DESeq(), constructed using model.matrix.")
}
designFactors <- designVars[designVarsClass == "factor"]
missingLevels <- sapply(designFactors, function(v) any(table(colData(se)[[v]]) == 0))
if (any(missingLevels)) {
message("factor levels were dropped which had no samples")
for (v in designFactors[missingLevels]) {
colData(se)[[v]] <- droplevels(colData(se)[[v]])
}
}
singleLevel <- sapply(designFactors, function(v) all(colData(se)[[v]] == colData(se)[[v]][1]))
if (any(singleLevel)) {
stop("design contains one or more variables with all samples having the same value,
remove these variables from the design")
}
# if the last variable in the design formula is a
# factor, and has a level 'control', check if it is
# the reference level and if not print a message
lastDV <- length(designVars)
if (length(designVars) > 0 && designVarsClass[lastDV] == "factor") {
lastDVLvls <- levels(colData(se)[[designVars[lastDV]]])
controlSynonyms <- c("control","Control","CONTROL")
for (cSyn in controlSynonyms) {
if (cSyn %in% lastDVLvls) {
if (cSyn != lastDVLvls[1]) {
message(paste0(" it appears that the last variable in the design formula, '",designVars[lastDV],"',
has a factor level, '",cSyn,"', which is not the reference level. we recommend
to use factor(...,levels=...) or relevel() to set this as the reference level
before proceeding. for more information, please see the 'Note on factor levels'
in vignette('DESeq2')."))
}
}
}
}
modelMatrix <- stats::model.matrix.default(design, data=as.data.frame(colData(se)))
} else if (is(design, "matrix")) {
modelMatrix <- design
} else {
stop("'design' should be a formula or a matrix")
}
if (!ignoreRank) {
checkFullRank(modelMatrix)
}
# Add columns on the columns
mcolsCols <- DataFrame(type=rep("input",ncol(colData(se))),
description=rep("",ncol(colData(se))))
mcols(colData(se)) <- if (is.null(mcols(colData(se)))) {
mcolsCols
} else if (all(names(mcols(colData(se))) == c("type","description"))) {
mcolsCols
} else {
cbind(mcols(colData(se)), mcolsCols)
}
object <- new("DESeqDataSet", se, design = design)
# now we know we have at least an empty GRanges or GRangesList for rowRanges
# so we can create a metadata column 'type' for the mcols
# and we label any incoming columns as 'input'
# this is metadata columns on the rows
mcolsRows <- DataFrame(type=rep("input",ncol(mcols(object))),
description=rep("",ncol(mcols(object))))
mcols(mcols(object)) <- if (is.null(mcols(mcols(object)))) {
mcolsRows
} else if (all(names(mcols(mcols(object))) == c("type","description"))) {
mcolsRows
} else {
cbind(mcols(mcols(object)), mcolsRows)
}
# stash the package version
metadata(object)[["version"]] <- packageVersion("DESeq2")
return(object)
}
#' @rdname DESeqDataSet
#' @export
DESeqDataSetFromMatrix <- function( countData, colData, design, tidy=FALSE, ignoreRank=FALSE, ... )
{
if (tidy) {
stopifnot(ncol(countData) > 1)
rownms <- as.character(countData[,1])
countData <- countData[,-1,drop=FALSE]
rownames(countData) <- rownms
}
# check that these agree in number
stopifnot(ncol(countData) == nrow(colData))
if (is(countData, "data.frame")) {
if (any(sapply(countData, is, "factor"))) {
warning("\n\n 'countData' is a data.frame with one or more factor columns.
Be aware that converting directly to numeric can lead to errors.
Provide matrices of integers to avoid this error.")
}
}
# we expect a matrix of counts, which are non-negative integers
countData <- as.matrix( countData )
if (is(colData,"data.frame"))
colData <- as(colData, "DataFrame")
# check if the rownames of colData are simply in different order
# than the colnames of the countData, if so throw an error
# as the user probably should investigate what's wrong
if (!is.null(rownames(colData)) & !is.null(colnames(countData))) {
if (all(sort(rownames(colData)) == sort(colnames(countData)))) {
if (!all(rownames(colData) == colnames(countData))) {
stop(paste("rownames of the colData:
",paste(rownames(colData),collapse=","),"
are not in the same order as the colnames of the countData:
",paste(colnames(countData),collapse=",")))
}
}
}
if (is.null(rownames(colData)) & !is.null(colnames(countData))) {
rownames(colData) <- colnames(countData)
}
se <- SummarizedExperiment(assays = SimpleList(counts=countData), colData = colData, ...)
object <- DESeqDataSet(se, design = design, ignoreRank)
return(object)
}
#' @rdname DESeqDataSet
#' @export
DESeqDataSetFromHTSeqCount <- function( sampleTable, directory=".", design, ignoreRank=FALSE, ...)
{
if (missing(design)) stop("design is missing")
l <- lapply( as.character( sampleTable[,2] ), function(fn) read.table( file.path( directory, fn ), fill=TRUE ) )
if( ! all( sapply( l, function(a) all( a$V1 == l[[1]]$V1 ) ) ) )
stop( "Gene IDs (first column) differ between files." )
# select last column of 'a', works even if htseq was run with '--additional-attr'
tbl <- sapply( l, function(a) a[,ncol(a)] )
colnames(tbl) <- sampleTable[,1]
rownames(tbl) <- l[[1]]$V1
rownames(sampleTable) <- sampleTable[,1]
oldSpecialNames <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
# either starts with two underscores
# or is one of the old special names (htseq-count backward compatability)
specialRows <- (substr(rownames(tbl),1,1) == "_") | rownames(tbl) %in% oldSpecialNames
tbl <- tbl[ !specialRows, , drop=FALSE ]
object <- DESeqDataSetFromMatrix(countData=tbl,colData=sampleTable[,-(1:2),drop=FALSE],design=design,ignoreRank, ...)
return(object)
}
#' @rdname DESeqDataSet
#' @export
DESeqDataSetFromTximport <- function(txi, colData, design, ...)
{
stopifnot(is(txi, "list"))
counts <- round(txi$counts)
mode(counts) <- "integer"
object <- DESeqDataSetFromMatrix(countData=counts, colData=colData, design=design, ...)
stopifnot(txi$countsFromAbundance %in% c("no","scaledTPM","lengthScaledTPM"))
if (txi$countsFromAbundance %in% c("scaledTPM","lengthScaledTPM")) {
message("using just counts from tximport")
} else {
message("using counts and average transcript lengths from tximport")
lengths <- txi$length
stopifnot(all(lengths > 0))
dimnames(lengths) <- dimnames(object)
assays(object)[["avgTxLength"]] <- lengths
}
return(object)
}
#' @rdname DESeqResults
#' @export
setClass("DESeqResults",
contains="DFrame",
representation = representation(
priorInfo = "list")
)
#' DESeqResults object and constructor
#'
#' This constructor function would not typically be used by "end users".
#' This simple class indirectly extends the DataFrame class defined in the
#' S4Vectors package to allow other packages to write methods for results
#' objects from the DESeq2 package. It is used by \code{\link{results}}
#' to wrap up the results table.
#'
#' @param DataFrame a DataFrame of results, standard column names are:
#' baseMean, log2FoldChange, lfcSE, stat, pvalue, padj.
#' @param priorInfo a list giving information on the log fold change prior
#'
#' @return a DESeqResults object
#' @docType class
#' @aliases DESeqResults-class
#' @rdname DESeqResults
#' @export
DESeqResults <- function(DataFrame, priorInfo=list()) {
new("DESeqResults", DataFrame, priorInfo=priorInfo)
}
#' @rdname DESeqTransform
#' @export
setClass("DESeqTransform", contains="RangedSummarizedExperiment")
#' DESeqTransform object and constructor
#'
#' This constructor function would not typically be used by "end users".
#' This simple class extends the RangedSummarizedExperiment class of the
#' SummarizedExperiment package.
#' It is used by \code{\link{rlog}} and
#' \code{\link{varianceStabilizingTransformation}}
#' to wrap up the results into a class for downstream methods,
#' such as \code{\link{plotPCA}}.
#'
#' @param SummarizedExperiment a RangedSummarizedExperiment
#'
#' @return a DESeqTransform object
#' @docType class
#' @aliases DESeqTransform-class
#' @rdname DESeqTransform
#' @export
DESeqTransform <- function(SummarizedExperiment) {
se <- SummarizedExperiment
if (!is(se, "RangedSummarizedExperiment")) {
if (is(se, "SummarizedExperiment")) {
se <- as(se, "RangedSummarizedExperiment")
} else {
stop("'SummarizedExperiment' must be a RangedSummarizedExperiment object")
}
}
new("DESeqTransform", se)
}
# unexported helper function
processTximeta <- function(se) {
assay(se) <- round(assay(se))
mode(assay(se)) <- "integer"
stopifnot("countsFromAbundance" %in% names(metadata(se)))
cfa <- metadata(se)$countsFromAbundance
stopifnot(cfa %in% c("no","scaledTPM","lengthScaledTPM"))
if (cfa %in% c("scaledTPM","lengthScaledTPM")) {
message("using just counts from tximeta")
} else {
message("using counts and average transcript lengths from tximeta")
stopifnot(all(assays(se)$length > 0))
anms <- assayNames(se)
assayNames(se)[anms == "length"] <- "avgTxLength"
}
return(se)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.