R/AllClasses.R

Defines functions ICASDataSetFromSJFile ICASDataSetFromMatrix ICASDataSet

Documented in ICASDataSet ICASDataSetFromMatrix ICASDataSetFromSJFile

#' @rdname ICASDataSet
#'
#' @description The ICASDataSet object is the center of Intron Centric Alternative Splicing (ICAS)
#' analysis. It stores all information associated with the dataset, including raw data, PSI,
#' annotations, analyes, etc. All that is needed to construct a ICASDataSet object is an Splice
#' Junction (SJ) expression matrix (rows are SJ, columns are cells), and GTF file which have been
#' used in STAR (which we strongly recommend) alignment.
#'
#' @import methods
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom utils packageVersion
#' @importFrom S4Vectors SimpleList
#' @importFrom SummarizedExperiment assayNames
#' @importFrom SummarizedExperiment assay
#' @importFrom SummarizedExperiment colData
#' @importFrom S4Vectors mcols
#' @importFrom SummarizedExperiment mcols
#' @importFrom S4Vectors metadata
#' @exportClass ICASDataSet
#'
#' @slot counts The raw SJ counts are stored in \code{RangedSummarizedExperiment} Container.
#' @slot psi The PSI of same start of all SJs are stored in \code{RangedSummarizedExperiment} Container.
#' @slot rowRanges The \code{GRanges} format information of all SJs are stored in \code{RangedSummarizedExperiment} Container.
#' @slot colData The sample information are stored in \code{RangedSummarizedExperiment} Container.
#' @slot design A \code{formula} about your experimental design.
#' @slot HostGene The host gene of all SJs. For novel (unannotated in GTF) SJs we use range overlap
#' method and we give all the covered genes and the genes anchored by the cleavage site at the same time.
#' @slot ASType The AS type of each ASSJ

setClass("ICASDataSet",
         contains = "RangedSummarizedExperiment",
         slots = representation(
           psi = "matrix",
           design = "character",
           HostGene = "ANY",
           ASType = "ANY",
           project.name = "character"
         ))

setValidity("ICASDataSet", function(object) {
  if (! ("counts" %in% SummarizedExperiment::assayNames(object)) )
    stop("the assays slot must contain a matrix named 'counts'" )
  if ( !is.numeric( counts(object) ) )
    stop("the count data is not numeric" )
  if ( any( is.na( counts(object) ) ) )
    stop("NA values are not allowed in the count matrix" )
  if ( !is.integer( counts(object) ) )
    stop("the count data is not in integer mode" )
  if ( any( counts(object) < 0 ) )
    stop("the count data contains negative values" )

  # 'design' is either a formula or matrix
  design <- object@design
  stopifnot(length(object@design) == 1)

  if (!design %in% names(SummarizedExperiment::colData(object))) {
    stop("the variable of design must be a column in colData")
  }

  if(class(SummarizedExperiment::colData(object)[[design]]) != "factor") {
    stop("the design variable are character vectors. convert these columns of colData(object)
           to factors before including in the design")
  }

  factor.lvls <- levels(SummarizedExperiment::colData(object)[[design]])

  if(any(!grepl("^[A-Za-z0-9_.]+$", factor.lvls))) {
    TRUE
    warning("Note: levels of factors in the design contain characters other than
           letters, numbers, '_' and '.'. It is required to use only letters, numbers,
           and delimiters '_' or '.', as these are safe characters for column names in R.")
  }

  TRUE
  })


#' ICASDataSet object and constructors
#'
#' \code{ICASDataSet} is a subclass of \code{RangedSummarizedExperiment},
#' used to store the input values, intermediate calculations and results of an
#' analysis of differential expression.  The \code{ICASDataSet} class
#' enforces non-negative integer values in the "counts" matrix stored as
#' the first element in the assay list.
#' In addition, a formula which specifies the design of the experiment must be provided.
#' The constructor functions create a ICASDataSet object from various types of input:
#' a RangedSummarizedExperiment, a matrix of splice junctions count, or a list of STAR output
#' splice junction files.
#'
#' See the vignette for examples of construction from different types.
#'
#' @param se a \code{RangedSummarizedExperiment} with columns of variables
#' indicating sample information in \code{colData}, and the counts as the first element in the
#' assays list, which will be renamed "counts". A \code{RangedSummarizedExperiment} object can
#' be generated by the function \code{summarizeOverlaps} in the GenomicAlignments package.
#' @param design a \code{character}.
#' the \code{character} indicate how to group the counts for each gene
#' depend on the variables in \code{colData}.
#' @param countData for matrix input: a matrix of non-negative integers
#' @param colData for matrix input: a \code{DataFrame} or \code{data.frame} with at least a single column.
#' Rows of colData correspond to columns of countData
#' @param ... arguments provided to \code{SummarizedExperiment} including rowRanges and metadata. Note that
#' for Bioconductor 3.1, rowRanges must be a GRanges or GRangesList, with potential metadata columns
#' as a DataFrame accessed and stored with \code{mcols}. If a user wants to store metadata columns
#' about the rows of the countData, but does not have GRanges or GRangesList information,
#' first construct the ICASDataSet without rowRanges and then add the DataFrame with \code{mcols(dds)}.
#'
#' @return A ICASDataSet object.
#'
#' @aliases ICASDataSet ICASDataSet-class ICASDataSetFromMatrix ICASDataSetFromSJFile
#'
#' @references See \url{https://github.com/alexdobin/STAR} for STAR
#'
#' @docType class
#'
#' @examples
#'
#' countData <- matrix(1:100,ncol=4)
#' condition <- factor(c("A","A","B","B"))
#' dds <- ICASDataSetFromMatrix(countData, DataFrame(condition), condition)
#'
#' @rdname ICASDataSet
#' @import methods
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom utils packageVersion
#' @importClassesFrom SummarizedExperiment RangedSummarizedExperiment
#' @export

ICASDataSet <- function(se, design) {
  if (!is(se, "RangedSummarizedExperiment")) {
    if (is(se, "SummarizedExperiment")) {
      se <- as(se, "RangedSummarizedExperiment")
    } else {
      stop("'se' must be a RangedSummarizedExperiment object")
    }
  }
  if (is.null(SummarizedExperiment::assayNames(se)) || SummarizedExperiment::assayNames(se)[1] != "counts") {
    message("renaming the first element in assays to 'counts'")
    SummarizedExperiment::assayNames(se)[1] <- "counts"
  }

  # before validity check, try to convert assay to integer mode
  if (any(is.na(SummarizedExperiment::assay(se))))
    stop("NA values are not allowed in the count matrix")
  if (any(SummarizedExperiment::assay(se) < 0)) {
    stop("some values in assay are negative")
  }
  if (!is.integer(SummarizedExperiment::assay(se))) {
    if (!is.numeric(SummarizedExperiment::assay(se))) {
      stop(paste("counts matrix should be numeric, currently it has mode:", mode(SummarizedExperiment::assay(se))))
    }
    if (any(round(SummarizedExperiment::assay(se)) != SummarizedExperiment::assay(se))) {
      stop("some values in assay are not integers")
    }
    message("converting counts to integer mode")
    mode(SummarizedExperiment::assay(se)) <- "integer"
  }

  if (all(SummarizedExperiment::assay(se) == 0)) {
    stop("all samples have 0 counts for all genes. check the counting script.")
  }

  if (all(rowSums(SummarizedExperiment::assay(se) == SummarizedExperiment::assay(se)[, 1]) == ncol(se))) {
    warning("all genes have equal values for all samples. will not be able to perform differential analysis")
  }

  if (any(duplicated(rownames(se)))) {
    warning(sum(duplicated(rownames(se)))," duplicate rownames were renamed by adding numbers")
    rnms <- rownames(se)
    dups <- unique(rnms[duplicated(rnms)])
    for (rn in dups) {
      idx <- which(rnms == rn)
      rnms[idx[-1]] <- paste(rnms[idx[-1]], c(seq_len(length(idx) - 1)), sep=".")
    }
    rownames(se) <- rnms
  }

  if(length(design) != 1) {
    stop("variable of design must be one element.")
  }

  if(!design %in% names(SummarizedExperiment::colData(se))) {
    stop("the variable of design must be a column in colData")
  }

  if (anyNA(SummarizedExperiment::colData(se)[[design]])) {
    stop(paste("variable of design cannot contain NA:", design))
  }

  if(class(SummarizedExperiment::colData(se)[[design]]) != "factor") {
    stop("the design variable are character vectors. convert these columns of colData(object)
         to factors before including in the design")
  }

  if(any(table(colData(se)[[design]]) == 0)) {
    message("factor levels were dropped which had no samples")
    colData(se)[[design]] <- droplevels(colData(se)[[design]])
  }

  if(all(colData(se)[[design]] == colData(se)[[design]][1])) {
    warning("design variable with all samples having the same value,
            remove these variables from the design")
  }


  # Add columns on the columns
  mcolsCols <- DataFrame(type = rep("input", ncol(SummarizedExperiment::colData(se))),
                         description = rep("", ncol(SummarizedExperiment::colData(se))))
  S4Vectors::mcols(SummarizedExperiment::colData(se)) <- if (is.null(S4Vectors::mcols(SummarizedExperiment::colData(se)))) {
    mcolsCols
  } else if (all(names(S4Vectors::mcols(SummarizedExperiment::colData(se))) == c("type", "description"))) {
    mcolsCols
  } else {
    cbind(S4Vectors::mcols(SummarizedExperiment::colData(se)), mcolsCols)
  }

  object <- new("ICASDataSet", se, design = design)

  # now we know we have at least an empty GRanges or GRangesList for rowRanges
  # so we can create a metadata column 'type' for the mcols
  # and we label any incoming columns as 'input'

  # this is metadata columns on the rows
  mcolsRows <- DataFrame(type = rep("input", ncol(SummarizedExperiment::mcols(object))),
                         description = rep("", ncol(SummarizedExperiment::mcols(object))))
  S4Vectors::mcols(SummarizedExperiment::mcols(object)) <- if (is.null(S4Vectors::mcols(SummarizedExperiment::mcols(object)))) {
    mcolsRows
  } else if (all(names(S4Vectors::mcols(SummarizedExperiment::mcols(object))) == c("type", "description"))) {
    mcolsRows
  } else {
    cbind(S4Vectors::mcols(SummarizedExperiment::mcols(object)), mcolsRows)
  }

  # stash the package version
  S4Vectors::metadata(object)[["version"]] <- packageVersion("ICAS")
  validObject(object)
  return(object)
}


#' @rdname ICASDataSet
#' @export

ICASDataSetFromMatrix <- function(countData, colData, design, ... ) {

  # check that these agree in number
  stopifnot(ncol(countData) == nrow(colData))

  # we expect a matrix of counts, which are non-negative integers
  countData <- as.matrix(countData)

  if (is(colData, "data.frame"))
    colData <- as(colData, "DataFrame")

  if (!design %in% names(colData)) {
    stop("the variable of design must be a column in colData")
  }

  if(class(colData[[design]]) != "factor") {
    stop("the design variable are character vectors. convert these columns of colData
         to factors before including in the design")
  }

  # check if the rownames of colData are simply in different order
  # than the colnames of the countData, if so throw an error
  # as the user probably should investigate what's wrong
  if (!is.null(rownames(colData)) & !is.null(colnames(countData))) {
    if (all(sort(rownames(colData)) == sort(colnames(countData)))) {
      if (!all(rownames(colData) == colnames(countData))) {
        stop(paste("rownames of the colData:",
                   paste(rownames(colData), collapse = ","),
                   "are not in the same order as the colnames of the countData:",
                   paste(colnames(countData), collapse = ",")))
      }
    }
  }
  if (is.null(rownames(colData)) & !is.null(colnames(countData))) {
    rownames(colData) <- colnames(countData)
  }

  se <- SummarizedExperiment(assays = S4Vectors::SimpleList(counts = countData), colData = colData, ...)
  object <- ICASDataSet(se, design = design)
  SummarizedExperiment::colData(object)$nSJs <- colSums(counts(object) > 0)
  SummarizedExperiment::colData(object)$nSJReads <- colSums(counts(object))

  # stash the package version
  S4Vectors::metadata(object)[["version"]] <- packageVersion("ICAS")
  validObject(object)
  return(object)
}

#' @rdname ICASDataSet
#' @export
#' @param SJFiles A character vector containing the names of the STAR outputed **SJ.out.tab**
#' files in the specified directories.
#' @param postfix The postfix of STAR outputed **SJ.out.tab**, [default is SJ.out.tab]
#' @param uniqMapOnly a logical value indicating whether only the uniquely mapping reads crossing
#' the junction to be used
#' @param annotationOnly a logical value indicating whether only the annotated reads crossing
#' the junction to be used
#' @param minSJRead the minimum number of reads crossing each junction of each sample
#' @param minOverhang the minimum length of maximum spliced alignment overhang of each junction of
#' each sample
#' @param minSJRowSum the minimum number of reads crossing each junction of all samples
#'


ICASDataSetFromSJFile <- function(SJFiles,
                                  postfix = "SJ.out.tab",
                                  uniqMapOnly = TRUE,
                                  annotationOnly = FALSE,
                                  minSJRead = 2,
                                  minOverhang = 3,
                                  minSJRowSum = 100,
                                  colData,
                                  design, ... ) {

  # check that these agree in number
  stopifnot(length(SJFiles) == nrow(colData))

  if (is(colData, "data.frame"))
    colData <- as(colData, "DataFrame")

  if (!design %in% names(colData)) {
    stop("the variable of design must be a column in colData")
  }

  if(class(colData[[design]]) != "factor") {
    stop("the design variable are character vectors. convert these columns of colData
         to factors before including in the design")
  }

  # check if the rownames of colData are simply in different order
  # than the colnames of the countData, if so throw an error
  # as the user probably should investigate what's wrong
  if (!is.null(rownames(colData)) & !is.null(gsub(postfix, "", basename(SJFiles)))) {
    if (all(sort(rownames(colData)) == sort(gsub(postfix, "", basename(SJFiles))))) {
      if (!all(rownames(colData) == gsub(postfix, "", basename(SJFiles)))) {
        stop(paste("rownames of the colData:",
                   paste(rownames(colData), collapse = ","),
                   "are not in the same order as the prefix of the SJFiles:",
                   paste(gsub(postfix, "", basename(SJFiles)), collapse = ",")))
      }
    }
  }

  countData <- multiMerge(SJFiles = SJFiles,
                          uniqMapOnly = uniqMapOnly,
                          minSJRead = minSJRead,
                          minSJRowSum = minSJRowSum,
                          annotationOnly = annotationOnly,
                          minOverhang = minOverhang,
                          postfix = postfix)

  # we expect a matrix of counts, which are non-negative integers
  countData <- as.matrix(countData)

  # check if the rownames of colData are simply in different order
  # than the colnames of the countData, if so throw an error
  # as the user probably should investigate what's wrong
  if (!is.null(rownames(colData)) & !is.null(colnames(countData))) {
    if (all(sort(rownames(colData)) == sort(colnames(countData)))) {
      if (!all(rownames(colData) == colnames(countData))) {
        stop(paste("rownames of the colData:",
                   paste(rownames(colData), collapse = ","),
                   "are not in the same order as the colnames of the countData:",
                   paste(colnames(countData), collapse = ",")))
      }
    }
  }

  se <- SummarizedExperiment(assays = S4Vectors::SimpleList(counts = countData), colData = colData, ...)
  object <- ICASDataSet(se, design = design)
  SummarizedExperiment::colData(object)$nSJs <- colSums(counts(object) > 0)
  SummarizedExperiment::colData(object)$nSJReads <- colSums(counts(object))

  # stash the package version
  S4Vectors::metadata(object)[["version"]] <- packageVersion("ICAS")
  validObject(object)
  return(object)
}
tangchao7498/ICAS documentation built on Jan. 28, 2021, 3:56 p.m.