R/CAGEexp.R

#' CAGEr class to hold all data and metadata about one CAGE experiment.
#' 
#' The [`CAGEr`] class is a [`MultiAssayExperiment`] object containing all data
#' and metadata about a set of CAGE libraries.  It replaced the CAGEset class
#' in 2017.  The main difference is that the expression data is stored
#' in [`DataFrame`] objects of [`Rle`]-encoded expression values, instead of plain
#' `data.frame`s.  With large datasets, this saves considerable amounts of memory.
#' 
#' @slot metadata A list that must at least contain a `genomeName` member.
#' 
#' @details If `genomeName` is `NULL`, checks of chromosome names will be
#' disabled and G-correction will not be possible.  See
#' <https://support.bioconductor.org/p/86437/> for an example on how to create a
#' _BSgenome_ package.
#' 
#' Sample labels must be _syntactically valid_ in the sense of the [make.names()]
#' function, because they will be used as column names in some tables.
#' 
#' @examples 
#' pathsToInputFiles <- list.files( system.file("extdata", package = "CAGEr")
#'                                , "ctss$"
#'                                , full.names = TRUE)
#' sampleLabels <- sub( ".chr17.ctss", "", basename(pathsToInputFiles))
#' 
#' # The CAGEexp object can be created using specific constructor commands
#'                               
#' exampleCAGEexp <-
#'   CAGEexp( genomeName     = "BSgenome.Drerio.UCSC.danRer7"
#'          , inputFiles     = pathsToInputFiles
#'          , inputFilesType = "ctss"
#'          , sampleLabels   = sub( ".chr17.ctss", "", basename(pathsToInputFiles)))
#'          
#' # Alternatively, it can be created just like another MultiAssayExperiment.
#' # This is useful when providing pre-existing colData with many columns.
#' 
#' exampleCAGEexp <-
#'   CAGEexp( metadata = list(genomeName = "BSgenome.Drerio.UCSC.danRer7")
#'          , colData  = DataFrame( inputFiles     = pathsToInputFiles
#'                                , sampleLabels   = sampleLabels
#'                                , inputFilesType = "ctss"
#'                                , row.names      = sampleLabels))
#' 
#' 
#' # Expression data is loaded by the getCTSS() function, that also calculates
#' # library sizes and store them in the object's column data.
#' 
#' exampleCAGEexp <- getCTSS(exampleCAGEexp)
#' librarySizes(exampleCAGEexp)
#' colData(exampleCAGEexp)
#' 
#' # CTSS data is stored internally as a SummarizedExperiemnt that can be retreived
#' # as a whole, or as GRanges, or as an expression DataFrame.
#' 
#' CTSStagCountSE(exampleCAGEexp)
#' CTSScoordinatesGR(exampleCAGEexp)
#' CTSStagCountDF(exampleCAGEexp)
#' 
#' # Columns of the "colData" table are accessible directly via the "$" operator.
#' 
#' exampleCAGEexp$l1 <- CTSStagCountDF(exampleCAGEexp) |> sapply ( \(col) sum(col > 0) )
#' exampleCAGEexp$l1
#' 
#' @seealso [`make.names`]
#' 
#' @rdname CAGEexp-class
#' @aliases CAGEexp-class
#' @import MultiAssayExperiment
#' @import SummarizedExperiment
#' @importFrom BSgenome installed.genomes
#' @importFrom S4Vectors SimpleList
#' @export CAGEexp
#' @exportClass CAGEexp

CAGEexp <- setClass("CAGEexp",
  contains = "MultiAssayExperiment",
  validity = function(object) {
    #		if(!(object@genomeName %in% suppressWarnings(suppressMessages(BSgenome::installed.genomes()()))))
    #		return("'genomeName' must be a name of one of the genome packages available in BSgenome! See 'BSgenome::installed.genomes()'")
    #		if(object@genomeName %in% rownames(installed.packages()) == FALSE)
    #		return("Requested genome is not installed! Please install required BSgenome package before running CAGEr.")
    
    if (! is.null(genomeName(object)))
      if (! genomeName(object) %in% installed.genomes())
        return( paste0(sQuote("genomeName"), " must be the name an installed genome package. "
              , "See ", sQuote("BSgenome::installed.genomes()")
              , " and ", sQuote("BSgenome::available.genomes()"), "."))
    
    if (is.null(colData(object)$inputFiles))
      return("Missing input file list.")
    
    if (is.null(object$inputFilesType))
      return("Missing input file type.")
    
    supportedTypes <- c("bam", "bamPairedEnd", "bed", "bedScore", "bedctss", "CAGEscanMolecule", "ctss", "CTSStable")
    
    if (! all(inputFilesType(object) %in% supportedTypes))
      return( paste(sQuote("inputFilesType"), "must be one of supported input file types:"
            , paste(sQuote(supportedTypes), collapse = ", "), "."))
 
    if (is.null(rownames(colData(object))))
      return("Rownames are missing in colData().  Did you forget to specify them?")
       
    if (is.null(colData(object)$sampleLabels))
      return("Missing sample labels.")
    
    if( !(identical(colData(object)$sampleLabels, make.names(colData(object)$sampleLabels))))
      stop( "Names of samples must be syntactically valid (see "
          , dQuote("?make.names"), ") because they will be used as column names "
          , "in some tables.  In brief, avoid minus signs and do not start with "
          , "numbers.")
    
    if (!(all(nzchar(colData(object)$sampleLabels))) |
        !(all(substr(colData(object)$sampleLabels, start = 1, stop = 1) %in%c(letters, LETTERS))))
      return("All sample labels must be a non-empty strings beginning with a letter!")
    
    if (length(unique(colData(object)$sampleLabels)) !=
        length(colData(object)$sampleLabels)) 
      return("Duplicated sample labels are not allowed!")
  }
)

setMethod( "initialize", "CAGEexp"
         , function( .Object
                   , ...
                   , metadata       = list()
                   , colData        = S4Vectors::DataFrame()
                   , genomeName     = NULL
                   , inputFiles     = NULL
                   , inputFilesType = NULL
                   , sampleLabels   = NULL) {
  if (missing(colData)) {
    if (missing(inputFiles)) stop ("Please provide input files paths.")
    colData <- S4Vectors::DataFrame(inputFiles = inputFiles)
  }
  if (! missing(inputFiles))     colData$inputFiles     <- inputFiles
  if (! missing(inputFilesType)) colData$inputFilesType <- inputFilesType
  if (! missing(sampleLabels))   colData$sampleLabels   <- sampleLabels
  if (! missing(sampleLabels))   rownames(colData)      <- sampleLabels
  if (! missing(genomeName))     metadata$genomeName    <- genomeName
  callNextMethod( .Object
                , metadata = metadata
                , colData  = colData
                , ...)
})

#' Example CAGEexp object.
#' 
#' Lazy-loaded example CAGEexp object, containing most of the CAGEr data
#' structures created with the CAGEr modifier functions.
#' 
#' @format A [CAGEexp] object.
#' 
#' @examples 
#' \dontrun{
#' pathsToInputFiles <- list.files( system.file("extdata", package = "CAGEr")
#'                                , "ctss$"
#'                                , full.names = TRUE)
#' sampleLabels <- sub( ".chr17.ctss", "", basename(pathsToInputFiles))
#' exampleCAGEexp <-
#'   CAGEexp( genomeName     = "BSgenome.Drerio.UCSC.danRer7"
#'          , inputFiles     = pathsToInputFiles
#'          , inputFilesType = "ctss"
#'          , sampleLabels   = sub( ".chr17.ctss", "", basename(pathsToInputFiles)))
#' exampleCAGEexp <- getCTSS(exampleCAGEexp)
#' librarySizes(exampleCAGEexp)
#' colData(exampleCAGEexp)
#' exampleCAGEexp$l1 <- NULL
#' exampleCAGEexp <- exampleCAGEexp[,c(5, 2, 1, 3, 4)] # Non-aplhabetic order may help catch bugs
#' CTSStagCountSE(exampleCAGEexp) <- CTSStagCountSE(exampleCAGEexp)[1:5000,]  # Slim the object
#' exampleCAGEexp$librarySizes <- sapply(CTSStagCountDF(exampleCAGEexp), sum) # Repair metadata
#' exampleCAGEexp <- 
#'   summariseChrExpr(exampleCAGEexp)                |>
#'   annotateCTSS(exampleZv9_annot)                  |>
#'   CTSStoGenes()                                   |>
#'   normalizeTagCount()                             |>
#'   getExpressionProfiles("CTSS")                   |>
#'   clusterCTSS()                                   |>
#'   cumulativeCTSSdistribution("tagClusters")       |>
#'   quantilePositions("tagClusters")                |>
#'   aggregateTagClusters()                          |>
#'   annotateConsensusClusters(exampleZv9_annot)     |>
#'   cumulativeCTSSdistribution("consensusClusters") |>
#'   quantilePositions("consensusClusters")          |>
#'   getExpressionProfiles("consensusClusters")      |>
#'   scoreShift( groupX = c("Zf.unfertilized.egg")
#'             , groupY = "Zf.30p.dome"
#'             , testKS = TRUE, useTpmKS = FALSE)
#' save(exampleCAGEexp, file = "data/exampleCAGEexp.RData", compress = "xz")
#' }

"exampleCAGEexp"


#' @name coerce,data.frame,CAGEexp-method
#' @rdname CAGEexp-class

setAs("data.frame", "CAGEexp", function(from){
  
  if((ncol(from) < 4) | !(all(colnames(from)[1:3] == c("chr", "pos", "strand"))))
    stop( "First three columns of the input data.frame must contain chromosome name, "
        , "genomic position and strand of individual TSSs, and must be named 'chr', 'pos' "
        , "and 'strand', respectively!")
  
  if(ncol(from) <4)
    stop( "Input data.frame needs to contain at least one column with CAGE tag counts, "
        , "in addition to first three columns specifying chromosome name, genomic position "
        , "and strand of individual TSSs!")
  
  if(!(is.integer(from[,"pos"])))
    stop( "The 'pos' column in the input data.frame can contain only non-zero integers "
        , "as these are interpreted as 1-based genomic coordinates of TSSs! Make sure the "
        , "'pos' column is of class 'integer'!")
  
  if(any(from[,"pos"] <= 0))
    stop( "The 'pos' column in the input data.frame can contain only non-zero integers as "
        , "these are interpreted as 1-based genomic coordinates of TSSs!")
  
  if(!(all(from[,"strand"] %in% c("+", "-"))))
    stop( "The 'strand' column in the input data.frame can contain only '+' or '-'!")
  
  if(!(all(apply(from[,4:ncol(from),drop=FALSE], 2, is.integer))))
    stop( "The columns specifying CAGE tag counts must be non-negative integers! Make sure "
        , "these columns are of class 'integer'!")

  if(any(apply(from[,4:ncol(from),drop=FALSE], 2, function(x) {any(x < 0)})))
    stop("The columns specifying CAGE tag counts must be non-negative integers!")
  
  sample.labels <- colnames(from)[4:ncol(from)]
  gr            <- GRanges(from$chr, IRanges(from$pos, width = 1), from$strand)
  counts        <- DataFrame(lapply(from[,4:ncol(from),drop=FALSE], Rle))
  
  object <-
    CAGEexp( metadata = list(genomeName = NULL)
           , colData = DataFrame( inputFiles     = "data.frame"
                                , sampleLabels   = sample.labels
                                , inputFilesType = "CTSStable"
                                , row.names      = sample.labels))

  CTSStagCountSE(object) <-
    SummarizedExperiment( rowRanges = gr
                        , assays    = SimpleList(counts = counts))
  
  object$librarySizes <- as.integer(colSums(from[,4:ncol(from),drop=FALSE]))
  
  
  return(object)
})
charles-plessy/CAGEr documentation built on Nov. 4, 2023, 11:57 a.m.