#' @rdname ICASDataSet
#'
#' @description The ICASDataSet object is the center of Intron Centric Alternative Splicing (ICAS)
#' analysis. It stores all information associated with the dataset, including raw data, PSI,
#' annotations, analyes, etc. All that is needed to construct a ICASDataSet object is an Splice
#' Junction (SJ) expression matrix (rows are SJ, columns are cells), and GTF file which have been
#' used in STAR (which we strongly recommend) alignment.
#'
#' @import methods
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom utils packageVersion
#' @importFrom S4Vectors SimpleList
#' @importFrom SummarizedExperiment assayNames
#' @importFrom SummarizedExperiment assay
#' @importFrom SummarizedExperiment colData
#' @importFrom S4Vectors mcols
#' @importFrom SummarizedExperiment mcols
#' @importFrom S4Vectors metadata
#' @exportClass ICASDataSet
#'
#' @slot counts The raw SJ counts are stored in \code{RangedSummarizedExperiment} Container.
#' @slot psi The PSI of same start of all SJs are stored in \code{RangedSummarizedExperiment} Container.
#' @slot rowRanges The \code{GRanges} format information of all SJs are stored in \code{RangedSummarizedExperiment} Container.
#' @slot colData The sample information are stored in \code{RangedSummarizedExperiment} Container.
#' @slot design A \code{formula} about your experimental design.
#' @slot HostGene The host gene of all SJs. For novel (unannotated in GTF) SJs we use range overlap
#' method and we give all the covered genes and the genes anchored by the cleavage site at the same time.
#' @slot ASType The AS type of each ASSJ
setClass("ICASDataSet",
contains = "RangedSummarizedExperiment",
slots = representation(
psi = "matrix",
design = "character",
HostGene = "ANY",
ASType = "ANY",
project.name = "character"
))
setValidity("ICASDataSet", function(object) {
if (! ("counts" %in% SummarizedExperiment::assayNames(object)) )
stop("the assays slot must contain a matrix named 'counts'" )
if ( !is.numeric( counts(object) ) )
stop("the count data is not numeric" )
if ( any( is.na( counts(object) ) ) )
stop("NA values are not allowed in the count matrix" )
if ( !is.integer( counts(object) ) )
stop("the count data is not in integer mode" )
if ( any( counts(object) < 0 ) )
stop("the count data contains negative values" )
# 'design' is either a formula or matrix
design <- object@design
stopifnot(length(object@design) == 1)
if (!design %in% names(SummarizedExperiment::colData(object))) {
stop("the variable of design must be a column in colData")
}
if(class(SummarizedExperiment::colData(object)[[design]]) != "factor") {
stop("the design variable are character vectors. convert these columns of colData(object)
to factors before including in the design")
}
factor.lvls <- levels(SummarizedExperiment::colData(object)[[design]])
if(any(!grepl("^[A-Za-z0-9_.]+$", factor.lvls))) {
TRUE
warning("Note: levels of factors in the design contain characters other than
letters, numbers, '_' and '.'. It is required to use only letters, numbers,
and delimiters '_' or '.', as these are safe characters for column names in R.")
}
TRUE
})
#' ICASDataSet object and constructors
#'
#' \code{ICASDataSet} is a subclass of \code{RangedSummarizedExperiment},
#' used to store the input values, intermediate calculations and results of an
#' analysis of differential expression. The \code{ICASDataSet} class
#' enforces non-negative integer values in the "counts" matrix stored as
#' the first element in the assay list.
#' In addition, a formula which specifies the design of the experiment must be provided.
#' The constructor functions create a ICASDataSet object from various types of input:
#' a RangedSummarizedExperiment, a matrix of splice junctions count, or a list of STAR output
#' splice junction files.
#'
#' See the vignette for examples of construction from different types.
#'
#' @param se a \code{RangedSummarizedExperiment} with columns of variables
#' indicating sample information in \code{colData}, and the counts as the first element in the
#' assays list, which will be renamed "counts". A \code{RangedSummarizedExperiment} object can
#' be generated by the function \code{summarizeOverlaps} in the GenomicAlignments package.
#' @param design a \code{character}.
#' the \code{character} indicate how to group the counts for each gene
#' depend on the variables in \code{colData}.
#' @param countData for matrix input: a matrix of non-negative integers
#' @param colData for matrix input: a \code{DataFrame} or \code{data.frame} with at least a single column.
#' Rows of colData correspond to columns of countData
#' @param ... arguments provided to \code{SummarizedExperiment} including rowRanges and metadata. Note that
#' for Bioconductor 3.1, rowRanges must be a GRanges or GRangesList, with potential metadata columns
#' as a DataFrame accessed and stored with \code{mcols}. If a user wants to store metadata columns
#' about the rows of the countData, but does not have GRanges or GRangesList information,
#' first construct the ICASDataSet without rowRanges and then add the DataFrame with \code{mcols(dds)}.
#'
#' @return A ICASDataSet object.
#'
#' @aliases ICASDataSet ICASDataSet-class ICASDataSetFromMatrix ICASDataSetFromSJFile
#'
#' @references See \url{https://github.com/alexdobin/STAR} for STAR
#'
#' @docType class
#'
#' @examples
#'
#' countData <- matrix(1:100,ncol=4)
#' condition <- factor(c("A","A","B","B"))
#' dds <- ICASDataSetFromMatrix(countData, DataFrame(condition), condition)
#'
#' @rdname ICASDataSet
#' @import methods
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom utils packageVersion
#' @importClassesFrom SummarizedExperiment RangedSummarizedExperiment
#' @export
ICASDataSet <- function(se, design) {
if (!is(se, "RangedSummarizedExperiment")) {
if (is(se, "SummarizedExperiment")) {
se <- as(se, "RangedSummarizedExperiment")
} else {
stop("'se' must be a RangedSummarizedExperiment object")
}
}
if (is.null(SummarizedExperiment::assayNames(se)) || SummarizedExperiment::assayNames(se)[1] != "counts") {
message("renaming the first element in assays to 'counts'")
SummarizedExperiment::assayNames(se)[1] <- "counts"
}
# before validity check, try to convert assay to integer mode
if (any(is.na(SummarizedExperiment::assay(se))))
stop("NA values are not allowed in the count matrix")
if (any(SummarizedExperiment::assay(se) < 0)) {
stop("some values in assay are negative")
}
if (!is.integer(SummarizedExperiment::assay(se))) {
if (!is.numeric(SummarizedExperiment::assay(se))) {
stop(paste("counts matrix should be numeric, currently it has mode:", mode(SummarizedExperiment::assay(se))))
}
if (any(round(SummarizedExperiment::assay(se)) != SummarizedExperiment::assay(se))) {
stop("some values in assay are not integers")
}
message("converting counts to integer mode")
mode(SummarizedExperiment::assay(se)) <- "integer"
}
if (all(SummarizedExperiment::assay(se) == 0)) {
stop("all samples have 0 counts for all genes. check the counting script.")
}
if (all(rowSums(SummarizedExperiment::assay(se) == SummarizedExperiment::assay(se)[, 1]) == ncol(se))) {
warning("all genes have equal values for all samples. will not be able to perform differential analysis")
}
if (any(duplicated(rownames(se)))) {
warning(sum(duplicated(rownames(se)))," duplicate rownames were renamed by adding numbers")
rnms <- rownames(se)
dups <- unique(rnms[duplicated(rnms)])
for (rn in dups) {
idx <- which(rnms == rn)
rnms[idx[-1]] <- paste(rnms[idx[-1]], c(seq_len(length(idx) - 1)), sep=".")
}
rownames(se) <- rnms
}
if(length(design) != 1) {
stop("variable of design must be one element.")
}
if(!design %in% names(SummarizedExperiment::colData(se))) {
stop("the variable of design must be a column in colData")
}
if (anyNA(SummarizedExperiment::colData(se)[[design]])) {
stop(paste("variable of design cannot contain NA:", design))
}
if(class(SummarizedExperiment::colData(se)[[design]]) != "factor") {
stop("the design variable are character vectors. convert these columns of colData(object)
to factors before including in the design")
}
if(any(table(colData(se)[[design]]) == 0)) {
message("factor levels were dropped which had no samples")
colData(se)[[design]] <- droplevels(colData(se)[[design]])
}
if(all(colData(se)[[design]] == colData(se)[[design]][1])) {
warning("design variable with all samples having the same value,
remove these variables from the design")
}
# Add columns on the columns
mcolsCols <- DataFrame(type = rep("input", ncol(SummarizedExperiment::colData(se))),
description = rep("", ncol(SummarizedExperiment::colData(se))))
S4Vectors::mcols(SummarizedExperiment::colData(se)) <- if (is.null(S4Vectors::mcols(SummarizedExperiment::colData(se)))) {
mcolsCols
} else if (all(names(S4Vectors::mcols(SummarizedExperiment::colData(se))) == c("type", "description"))) {
mcolsCols
} else {
cbind(S4Vectors::mcols(SummarizedExperiment::colData(se)), mcolsCols)
}
object <- new("ICASDataSet", se, design = design)
# now we know we have at least an empty GRanges or GRangesList for rowRanges
# so we can create a metadata column 'type' for the mcols
# and we label any incoming columns as 'input'
# this is metadata columns on the rows
mcolsRows <- DataFrame(type = rep("input", ncol(SummarizedExperiment::mcols(object))),
description = rep("", ncol(SummarizedExperiment::mcols(object))))
S4Vectors::mcols(SummarizedExperiment::mcols(object)) <- if (is.null(S4Vectors::mcols(SummarizedExperiment::mcols(object)))) {
mcolsRows
} else if (all(names(S4Vectors::mcols(SummarizedExperiment::mcols(object))) == c("type", "description"))) {
mcolsRows
} else {
cbind(S4Vectors::mcols(SummarizedExperiment::mcols(object)), mcolsRows)
}
# stash the package version
S4Vectors::metadata(object)[["version"]] <- packageVersion("ICAS")
validObject(object)
return(object)
}
#' @rdname ICASDataSet
#' @export
ICASDataSetFromMatrix <- function(countData, colData, design, ... ) {
# check that these agree in number
stopifnot(ncol(countData) == nrow(colData))
# we expect a matrix of counts, which are non-negative integers
countData <- as.matrix(countData)
if (is(colData, "data.frame"))
colData <- as(colData, "DataFrame")
if (!design %in% names(colData)) {
stop("the variable of design must be a column in colData")
}
if(class(colData[[design]]) != "factor") {
stop("the design variable are character vectors. convert these columns of colData
to factors before including in the design")
}
# check if the rownames of colData are simply in different order
# than the colnames of the countData, if so throw an error
# as the user probably should investigate what's wrong
if (!is.null(rownames(colData)) & !is.null(colnames(countData))) {
if (all(sort(rownames(colData)) == sort(colnames(countData)))) {
if (!all(rownames(colData) == colnames(countData))) {
stop(paste("rownames of the colData:",
paste(rownames(colData), collapse = ","),
"are not in the same order as the colnames of the countData:",
paste(colnames(countData), collapse = ",")))
}
}
}
if (is.null(rownames(colData)) & !is.null(colnames(countData))) {
rownames(colData) <- colnames(countData)
}
se <- SummarizedExperiment(assays = S4Vectors::SimpleList(counts = countData), colData = colData, ...)
object <- ICASDataSet(se, design = design)
SummarizedExperiment::colData(object)$nSJs <- colSums(counts(object) > 0)
SummarizedExperiment::colData(object)$nSJReads <- colSums(counts(object))
# stash the package version
S4Vectors::metadata(object)[["version"]] <- packageVersion("ICAS")
validObject(object)
return(object)
}
#' @rdname ICASDataSet
#' @export
#' @param SJFiles A character vector containing the names of the STAR outputed **SJ.out.tab**
#' files in the specified directories.
#' @param postfix The postfix of STAR outputed **SJ.out.tab**, [default is SJ.out.tab]
#' @param uniqMapOnly a logical value indicating whether only the uniquely mapping reads crossing
#' the junction to be used
#' @param annotationOnly a logical value indicating whether only the annotated reads crossing
#' the junction to be used
#' @param minSJRead the minimum number of reads crossing each junction of each sample
#' @param minOverhang the minimum length of maximum spliced alignment overhang of each junction of
#' each sample
#' @param minSJRowSum the minimum number of reads crossing each junction of all samples
#'
ICASDataSetFromSJFile <- function(SJFiles,
postfix = "SJ.out.tab",
uniqMapOnly = TRUE,
annotationOnly = FALSE,
minSJRead = 2,
minOverhang = 3,
minSJRowSum = 100,
colData,
design, ... ) {
# check that these agree in number
stopifnot(length(SJFiles) == nrow(colData))
if (is(colData, "data.frame"))
colData <- as(colData, "DataFrame")
if (!design %in% names(colData)) {
stop("the variable of design must be a column in colData")
}
if(class(colData[[design]]) != "factor") {
stop("the design variable are character vectors. convert these columns of colData
to factors before including in the design")
}
# check if the rownames of colData are simply in different order
# than the colnames of the countData, if so throw an error
# as the user probably should investigate what's wrong
if (!is.null(rownames(colData)) & !is.null(gsub(postfix, "", basename(SJFiles)))) {
if (all(sort(rownames(colData)) == sort(gsub(postfix, "", basename(SJFiles))))) {
if (!all(rownames(colData) == gsub(postfix, "", basename(SJFiles)))) {
stop(paste("rownames of the colData:",
paste(rownames(colData), collapse = ","),
"are not in the same order as the prefix of the SJFiles:",
paste(gsub(postfix, "", basename(SJFiles)), collapse = ",")))
}
}
}
countData <- multiMerge(SJFiles = SJFiles,
uniqMapOnly = uniqMapOnly,
minSJRead = minSJRead,
minSJRowSum = minSJRowSum,
annotationOnly = annotationOnly,
minOverhang = minOverhang,
postfix = postfix)
# we expect a matrix of counts, which are non-negative integers
countData <- as.matrix(countData)
# check if the rownames of colData are simply in different order
# than the colnames of the countData, if so throw an error
# as the user probably should investigate what's wrong
if (!is.null(rownames(colData)) & !is.null(colnames(countData))) {
if (all(sort(rownames(colData)) == sort(colnames(countData)))) {
if (!all(rownames(colData) == colnames(countData))) {
stop(paste("rownames of the colData:",
paste(rownames(colData), collapse = ","),
"are not in the same order as the colnames of the countData:",
paste(colnames(countData), collapse = ",")))
}
}
}
se <- SummarizedExperiment(assays = S4Vectors::SimpleList(counts = countData), colData = colData, ...)
object <- ICASDataSet(se, design = design)
SummarizedExperiment::colData(object)$nSJs <- colSums(counts(object) > 0)
SummarizedExperiment::colData(object)$nSJReads <- colSums(counts(object))
# stash the package version
S4Vectors::metadata(object)[["version"]] <- packageVersion("ICAS")
validObject(object)
return(object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.