R/ImportMethods.R

Defines functions .importPublicData_ZF .importPublicData_F5 .importPublicData .df2SE import.CAGEscanMolecule import.CTSS import.bedCTSS import.bedScore import.bedmolecule import.bam.ctss bam2CTSS import.bam moleculesGR2CTSS loadFileIntoGPos coerceInBSgenome addCTSScolumn toCTSSdt checkFilesExist

Documented in bam2CTSS coerceInBSgenome import.bam import.bam.ctss import.bedCTSS import.bedmolecule import.bedScore import.CAGEscanMolecule import.CTSS loadFileIntoGPos moleculesGR2CTSS

#' @include CAGEr.R CAGEexp.R ClusteringMethods.R CTSS.R GetMethods.R SetMethods.R

#' @name getCTSS
#' 
#' @title Reading CAGE data from input file(s) and detecting TSSs
#' 
#' @description Reads input CAGE datasets into CAGEr object, constructs CAGE
#' transcriptions start sites (CTSSs) and counts number of CAGE tags supporting every
#' CTSS in each input experiment.  See \code{\link{inputFilesType}} for details on
#' the supported input formats. Preprocessing and quality filtering of input CAGE
#' tags, as well as correction of CAGE-specific 'G' nucleotide addition bias can be
#' also performed before constructing TSSs.
#' 
#' @param object A [`CAGEexp`] object.
#' 
#' @param sequencingQualityThreshold Only CAGE tags with average sequencing quality
#'   \code{>= sequencingQualityThreshold} and mapping quality \code{>=
#'   mappingQualityThreshold} are kept. Used only if \code{inputFileType(object)
#'   == "bam"} or \code{inputFileType(object) == "bamPairedEnd"}, \emph{i.e} when
#'   input files are BAM files of aligned sequenced CAGE tags, otherwise ignored.  If
#'   there are no sequencing quality values in the BAM file (\emph{e.g.} HeliScope
#'   single molecule sequencer does not return sequencing qualities) all reads will by
#'   default have this value set to -1.  Since the default value of
#'   \code{sequencingQualityThreshold} is 10, all the reads will consequently be
#'   discarded. To avoid this behaviour and keep all sequenced reads set
#'   \code{sequencingQualityThreshold} to -1 when processing data without sequencing
#'   qualities.  If there is no information on mapping quality in the BAM file
#'   (\emph{e.g.} software used to align CAGE tags to the referent genome does not
#'   provide mapping quality) the \code{mappingQualityThreshold} parameter is ignored.
#'   In case of paired-end sequencing BAM file (i.e. \code{inputFileType(object) ==
#'   "bamPairedEnd"}) only the first mate of the properly paired reads (i.e. the five
#'   prime end read) will be read and subject to specified thresholds.
#'   
#' @param mappingQualityThreshold See sequencingQualityThreshold.
#' 
#' @param removeFirstG Logical, should the first nucleotide of the CAGE tag be removed
#'   in case it is a G and it does not map to the referent genome (\emph{i.e.} it is a
#'   mismatch).  Used only if \code{inputFileType(object) == "bam"} or
#'   \code{inputFileType(object) == "bamPairedEnd"}, \emph{i.e} when input files are
#'   BAM files of aligned sequenced CAGE tags, otherwise ignored.  See Details.
#' 
#' @param correctSystematicG Logical, should the systematic correction of the first G
#'   nucleotide be performed for the positions where there is a G in the CAGE tag and G
#'   in the genome.  This step is performed in addition to removing the first G of the
#'   CAGE tags when it is a mismatch, \emph{i.e.} this option can only be used when
#'   \code{removeFirstG = TRUE}, otherwise it is ignored.  The frequency of adding a G
#'   to CAGE tags is estimated from mismatch cases and used to systematically correct
#'   the G addition for positions with G in the genome.  Used only if
#'   \code{inputFileType(object) == "bam"} or \code{inputFileType(object) ==
#'   "bamPairedEnd"}, \emph{i.e} when input files are BAM files of aligned sequenced
#'   CAGE tags, otherwise ignored.  See Details.
#'   
#' @param useMulticore Logical, should multicore be used.
#'   \code{useMulticore = TRUE} has no effect on non-Unix-like platforms.
#'   
#' @param nrCores Number of cores to use when \code{useMulticore = TRUE}
#'   (set to \code{NULL} to use all detected cores).
#' 
#' @details In the CAGE experimental protocol an additional G nucleotide is often attached
#' to the 5' end of the tag by the template-free activity of the reverse transcriptase used
#' to prepare cDNA (Harbers and Carninci, Nature Methods 2005).  In cases where there is a
#' G at the 5' end of the CAGE tag that does not map to the corresponding genome sequence,
#' it can confidently be considered spurious and should be removed from the tag to avoid
#' misannotating actual TSS. Thus, setting \code{removeFirstG = TRUE} is highly recommended.
#' 
#' However, when there is a G both at the beginning of the CAGE tag and in the genome, it is
#' not clear whether the original CAGE tag really starts at this position or the G nucleotide
#' was added later in the experimental protocol.  To systematically correct CAGE tags mapping
#' at such positions, a general frequency of adding a G to CAGE tags can be calculated from
#' mismatch cases and applied to estimate the number of CAGE tags that have G added and
#' should actually start at the next nucleotide/position.  The option \code{correctSystematicG}
#' is an implementation of the correction algorithm described in Carninci \emph{et al.},
#' Nature Genetics 2006, Supplementary Information section 3-e.
#' 
#' @return Returns the object, in which the `tagCountMatrix` experiment will be
#' occupied by a [`RangedSummarizedExperiment`] containing the expression data
#' as a `DataFrame` of `Rle` integers, and the CTSS coordinates as genomic
#' ranges in a [`CTSS`] object.  The expression data can be retrieved with
#' the [`CTSStagCountDF`] function.  In addition, the library sizes are
#' calculated and stored in the object's sample data (see [`librarySizes`]).
#' 
#' @references 
#' 
#' Harbers and Carninci (2005) Tag-based approaches for transcriptome research and genome
#' annotation, \emph{Nature Methods} \bold{2}(7):495-502.
#' 
#' Carninci \emph{et al.} (2006) Genome-wide analysis of mammalian promoter architecture and
#' evolution, \emph{Nature Genetics} \bold{38}(7):626-635.
#' 
#' @author Vanja Haberle
#' 
#' @seealso \code{\link{inputFilesType}}, \code{\link{librarySizes}}.
#' 
#' @family CAGEr object modifiers
#' 
#' @examples
#' library(BSgenome.Drerio.UCSC.danRer7)
#' 
#' pathsToInputFiles <- system.file("extdata", c("Zf.unfertilized.egg.chr17.ctss",
#'   "Zf.30p.dome.chr17.ctss", "Zf.prim6.rep1.chr17.ctss"), package="CAGEr")
#'   
#' labels <- paste("sample", seq(1,3,1), sep = "")
#' 
#' myCAGEexp <- new("CAGEexp", genomeName = "BSgenome.Drerio.UCSC.danRer7",
#'  inputFiles = pathsToInputFiles, inputFilesType = "ctss", sampleLabels = labels)
#' 
#' myCAGEexp <- getCTSS(myCAGEexp)
#' 
#' @docType methods
#' 
#' @importFrom  S4Vectors DataFrame
#' @importFrom  S4Vectors Rle
#' @export

setGeneric( "getCTSS"
          , function( object
                    , sequencingQualityThreshold = 10, mappingQualityThreshold = 20
                    , removeFirstG = TRUE, correctSystematicG = TRUE
                    , useMulticore = FALSE, nrCores = NULL)
	            standardGeneric("getCTSS"))

checkFilesExist <- function(paths) {
  for (f in paths)
    if (isFALSE(grepl("^http", f)))
      if (! file.exists(f)) stop("Could not locate input file ", f)
}

#' toCTSSdt
#' 
#' Non-exported private helper function
#' 
#' @return Returns a \code{data.table} with TSS indicated in the \dQuote{chr},
#' \dQuote{pos} and \dQuote{strand} columns, and a score column named with
#' the sample label.
#' 
#' @noRd
#' @importFrom data.table data.table
#' @importFrom data.table setkeyv
#' @importFrom data.table setnames

toCTSSdt <- function(reads.GRanges, sample.label) {
  reads.GRanges.plus  <- reads.GRanges[strand(reads.GRanges) == "+"]
  reads.GRanges.minus <- reads.GRanges[strand(reads.GRanges) == "-"]
  gr2ctssdf <- function(gr, strand)
    data.frame( chr    = as.character(seqnames(gr))
              , pos    = if (strand == "+") start(gr) else end(gr)
              , strand = strand
              , stringsAsFactors = F)
  CTSS.plus <- gr2ctssdf(reads.GRanges.plus, "+")
	CTSS.minus <-gr2ctssdf(reads.GRanges.minus, "-")
  CTSS <- rbind(CTSS.plus, CTSS.minus)
	CTSS$tag_count <- 1L
	CTSS <- .byCtss(data.table(CTSS), "tag_count", sum)
	setnames(CTSS, c("chr", "pos", "strand", sample.label))
	setkeyv(CTSS, c("chr", "pos", "strand"))
	CTSS
}

addCTSScolumn <- function(CTSS.all.samples, CTSS) {
  if(is.null(CTSS.all.samples))
    return(CTSS)
  merge(CTSS.all.samples, CTSS, all.x = TRUE, all.y = TRUE)
}

#' coerceInBSgenome
#' 
#' A private (non-exported) function to discard any range that is
#' not compatible with the CAGEr object's BSgenome.
#' 
#' @param gr The genomic ranges to coerce.
#' @param genome The name of a BSgenome package, which must me installed,
#'   or \code{NULL} to skip coercion.
#' 
#' @return A GRanges object in which every range is guaranteed to be compatible
#' with the given BSgenome object.  The sequnames of the GRanges are also set
#' accordingly to the BSgenome.
#' 
#' @importFrom GenomeInfoDb seqinfo
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom GenomeInfoDb seqlevels seqlevels<-
#' @importFrom GenomeInfoDb seqnames
#' @importFrom S4Vectors %in%

coerceInBSgenome <- function(gr, genome) {
  if (is.null(genome)) return(gr)
  genome <- getRefGenome(genome)
  gr <- gr[seqnames(gr) %in% seqnames(genome)]
  gr <- gr[! end(gr) > seqlengths(genome)[as.character(seqnames(gr))]]
  seqlevels(gr) <- seqlevels(genome)
  seqinfo(gr) <- seqinfo(genome)
  gr
}

#' loadFileIntoGPos
#' 
#' A private (non-exported) function to load from each file format supported by CAGEr
#' 
#' @param filepath The path to the file to load.
#' @param filetype The type of the file
#' @param sequencingQualityThreshold See getCTSS().
#' @param mappingQualityThreshold See getCTSS().
#' @param removeFirstG See getCTSS().
#' @param correctSystematicG See getCTSS().
#' @param genome See coerceInBSgenome().
#' 
#' @return A [GPos()] object where the score represents the number of CAGE tags
#' starting on that nucleotide.
#' 
#' @seealso import.CTSS
#' @family loadFileIntoGPos

loadFileIntoGPos    <- function( filepath
                               , filetype = c( "bam", "bamPairedEnd"
                                             , "bed", "bedctss", "bedScore"
                                             , "CAGEscanMolecule", "ctss")
                               , sequencingQualityThreshold
                               , mappingQualityThreshold
                               , removeFirstG
                               , correctSystematicG
                               , genome) {
  if (missing(filetype)) stop("Please specify the file type.")
  filetype <- match.arg(filetype)
  switch( filetype
        , bam              = import.bam.ctss( filepath = filepath
                                            , filetype = "bam"
                                            , sequencingQualityThreshold = sequencingQualityThreshold
                                            , mappingQualityThreshold = mappingQualityThreshold
                                            , removeFirstG = removeFirstG
                                            , correctSystematicG = correctSystematicG
                                            , genome = genome)
        , bamPairedEnd     = import.bam.ctss( filepath = filepath
                                            , filetype = "bamPairedEnd"
                                            , sequencingQualityThreshold = sequencingQualityThreshold
                                            , mappingQualityThreshold = mappingQualityThreshold
                                            , removeFirstG = removeFirstG
                                            , correctSystematicG = correctSystematicG
                                            , genome = genome)
        , bed              = import.bedmolecule(filepath)
        , bedScore         = import.bedScore(filepath)
        , bedctss          = import.bedCTSS(filepath)
        , CAGEscanMolecule = import.CAGEscanMolecule(filepath)
        , ctss             = import.CTSS(filepath))
}

#' moleculesGR2CTSS
#' 
#' Calculates CTSS positions from a GenomicRanges object where each element
#' represents a single molecule.
#' 
#' @param gr A [GRanges] object.
#' 
#' @return Returns a [GRanges] object.
#' 
#' @family loadFileIntoGPos
#' 
#' @importFrom S4Vectors Rle
#' 
#' @examples
#' gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1, 10), c("+", "-", "+"))
#' CAGEr:::moleculesGR2CTSS(gr)

moleculesGR2CTSS <- function(gr) {
  tb <- table(promoters(gr, 0, 1))
  gp <- as(names(tb), "UnstitchedGPos")
  score(gp) <- Rle(as.integer(tb))
  gp
}

#' import.bam
#' 
#' Imports CTSS data from a BAM file.
#' 
#' @param filepath The path to the BAM file.
#' @param filetype bam or bamPairedEnd.
#' @param sequencingQualityThreshold See getCTSS().
#' @param mappingQualityThreshold See getCTSS().
#' 
#' @family loadFileIntoGPos
#' 
#' @importFrom Rsamtools bamFlag<-
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools scanBamFlag
#' @importFrom GenomicAlignments readGAlignments
#' @importFrom GenomicAlignments qwidth
#' 
#' @examples
#' # TODO: add exmaple file
#' # import.bam(system.file("extdata", "example.bam", package = "CAGEr"))

import.bam <- function( filepath
                      , filetype
                      , sequencingQualityThreshold = 10
                      , mappingQualityThreshold = 20) {
  param <- ScanBamParam( what       = c("seq", "qual")
                       , flag       = scanBamFlag(isUnmappedQuery = FALSE)
                       , mapqFilter = mappingQualityThreshold)
  if (filetype == "bamPairedEnd")
    bamFlag(param) <- scanBamFlag( isUnmappedQuery = FALSE
                                 , isProperPair    = TRUE
                                 , isFirstMateRead = TRUE)
  bam <- readGAlignments(filepath, param = param)
  message("\t-> Filtering out low quality reads...")
  # We need to loop on qualities because there is a hard limit on the length.
  # See <https://support.bioconductor.org/p/97993/>.
  qual <- mcols(bam)$qual
  start <- 1
  chunksize <- 1e6
  qual.avg <- vector(mode = "integer")
  repeat {
    if (start + chunksize <= length(qual)) {
      end <- start + chunksize
    } else {
      end <- length(qual)
    }
    qual.avg <- c(qual.avg, as.integer(mean(as(qual[start:end], "IntegerList"))))
    if (end == length(qual)) {
      break
    } else {
      start <- end + 1
    }
  }
  bam <- bam[qual.avg >= sequencingQualityThreshold]
  gr <- GRanges(bam)
  gr$read.length <- qwidth(bam)
  gr
}

#' bam2CTSS
#' 
#' Converts from BAM to CTSS
#' 
#' Converts genomic ranges representing SAM/BAM alignments into a CTSS object.
#' 
#' @param gr A \code{\link{GRanges}} object returned by \code{\link{import.bam}}.
#' @param removeFirstG See getCTSS().
#' @param correctSystematicG See getCTSS().
#' @param genome See coerceInBSgenome().
#' 
#' @return Returns a \code{\link{CTSS}} object.
#' 
#' @importFrom GenomeInfoDb bsgenomeName
#' @family loadFileIntoGPos

bam2CTSS <- function(gr, removeFirstG, correctSystematicG, genome) {
  gr <- coerceInBSgenome(gr, genome)
  if(removeFirstG == TRUE) {
    message("\t-> Removing the first base of the reads if 'G' and not aligned to the genome...")
    if (isFALSE(correctSystematicG)) {
      gp <- .remove.added.G.CTSS(gr, genome, correctSystematicG = FALSE)
    } else {
      # Using the old CAGEset function that returns a data.table
      gp <- .remove.added.G(gr, genome, correctSystematicG = TRUE, "score")
      return(gp)
    }
  } else {
    gp <- CTSS(promoters(gr, 0, 1))
  }
  tb <- table(gp)
  gp <- CTSS(names(tb), bsgenomeName = genome)
  score(gp) <- Rle(unclass(tb))
  gp
} 

#' import.bam.ctss
#' 
#' Imports CTSS data from a BAM file.
#' 
#' @param filepath The path to the BAM file.
#' @param filetype bam or bamPairedEnd.
#' @param sequencingQualityThreshold See getCTSS().
#' @param mappingQualityThreshold See getCTSS().
#' @param removeFirstG See getCTSS().
#' @param correctSystematicG See getCTSS().
#' @param genome See coerceInBSgenome().
#' 
#' @return Returns a [CTSS] object.
#' 
#' @family loadFileIntoGPos

import.bam.ctss <- function( filepath, filetype, sequencingQualityThreshold
                           , mappingQualityThreshold, removeFirstG
                           , correctSystematicG, genome) {
  bam <- import.bam( filepath = filepath
                   , filetype = filetype
                   , sequencingQualityThreshold = sequencingQualityThreshold
                   , mappingQualityThreshold = mappingQualityThreshold)
  bam2CTSS( gr= bam
          , removeFirstG = removeFirstG
          , correctSystematicG = correctSystematicG
          , genome = genome)
}

#' import.bedmolecule
#' 
#' Imports a BED file where each line counts for one molecule in a
#' GRanges object where each line represents one nucleotide.
#' 
#' @param filepath The path to the BED file.
#' 
#' @return Returns a \code{\link{CTSS}} object.
#' 
#' @family loadFileIntoGPos
#' 
#' @importFrom rtracklayer import.bed
#' @importFrom S4Vectors Rle
#' 
#' @examples
#' # TODO: add exmaple file
#' # import.BED(system.file("extdata", "example.bed", package = "CAGEr"))

import.bedmolecule <- function(filepath) {
  moleculesGR2CTSS(rtracklayer::import.bed(filepath))
}

#' import.bedScore
#' 
#' Imports a BED file where the score indicates a number of counts
#' for a given alignment.
#' 
#' @param filepath The path to the BED file.
#' 
#' @return A GRanges object where each line represents one nucleotide.
#' 
#' @family loadFileIntoGPos
#' 
#' @importFrom rtracklayer import.bed
#' @importFrom S4Vectors Rle
#' 
#' @examples
#' # TODO: add exmaple file
#' # import.bedScore(system.file("extdata", "example.bed", package = "CAGEr"))

import.bedScore <- function(filepath) {
  gr <- rtracklayer::import.bed(filepath)
  df <- data.frame( p = as.character(promoters(gr, 0, 1))
                  , s = score(gr))
  df <- rowsum(df$s, df$p)
  gp <- as(rownames(df), "UnstitchedGPos")
  score(gp) <- Rle(as.integer(df[,1]))
  gp
}

#' import.bedCTSS
#' 
#' Imports a BED file where each line represents a single base, with a score
#' counting the number of CAGE transcription start sites (CTSS).
#' 
#' @return A GRanges object where each line represents one nucleotide.
#' 
#' @param filepath The path to the BED file.
#' 
#' @family loadFileIntoGPos
#' 
#' @importFrom rtracklayer import.bed
#' @importFrom GenomeInfoDb sortSeqlevels
#' @importFrom GenomicRanges countOverlaps
#' 
#' @examples
#' # TODO: add exmaple file
#' # import.BED(system.file("extdata", "example.bed", package = "CAGEr"))

import.bedCTSS <- function(filepath) {
  gr <- rtracklayer::import.bed(filepath)
  if (length(unique(gr)) != length(gr))
    stop("Input must not contain the same range multiple times.")
  if (any(countOverlaps(gr) != 1))
    stop("Input must not contain overlapping ranges.")
  seqlevels(gr) <- sortSeqlevels(seqlevels(gr))
  mcols(gr) <- DataFrame(score = Rle(as.integer(score(gr))))
  as(gr, "UnstitchedGPos")
}

#' import.CTSS
#' 
#' Imports a "CTSS" file in a [GPos] object
#' 
#' @param filepath The path to the "CTSS" file.
#' 
#' Note that the format of the "CTSS" files handled in this function is not
#' the same as the FANTOM5 "CTSS" files (which are plain BED).
#' 
#' @family loadFileIntoGPos
#' 
#' @importFrom utils read.table
#' 
#' @examples
#' CAGEr:::import.CTSS(system.file("extdata", "Zf.high.chr17.ctss", package = "CAGEr"))

import.CTSS <- function(filepath) {
  CTSS <- read.table( file = filepath
                      , header = F
                      , sep = "\t"
                      , col.names  = c("chr",       "pos",     "strand",    "score")
                      , colClasses = c("character", "integer", "character", "integer"))
  gp <- GPos( stitch = FALSE
            , GRanges( seqnames = CTSS$chr
                     , ranges   = IRanges(CTSS$pos, width = 1)
                     , strand   = CTSS$strand))
  score(gp) <- CTSS$score
  gp
}

#' parseCAGEscanBlocksToGrangeTSS
#' 
#' Parse a string describing a block in a CAGEscan molecule, as output by
#' the "CAGEscan 3.0" pipeline.
#' 
#' @param blocks A character string representing a block in a CAGEscan molecule.
#' 
#' @return A GRanges object representing a TSS.
#' 
#' In CAGEscan molecules, blocks are separated by \sQuote{|}, \sQuote{,} or
#' \sQuote{;} for gap of coverage, splice junction (confident) and splice
#' junction (maybe) respectively.  Strand is "+" if first coordinate is lower
#' than the second one, and "-" otherwise.
#' 
#' @seealso import.CAGEscanMolecule
#' 
#' @examples
#' myMolecule <- paste0( "chr11:66268633-66268693,"
#'                     , "chr11:66271796-66271869;"
#'                     , "chr11:66272156-66272252|"
#'                     , "chr11:66272364-66272460")
#' myFirstBlock <- sub("[,;|].*", "", myMolecule)
#' 
#' CAGEr:::parseCAGEscanBlocksToGrangeTSS(myFirstBlock)

parseCAGEscanBlocksToGrangeTSS <- function (blocks) {
  blocks <- strsplit(blocks, "[:-]")
  chr    <- unlist(lapply(blocks, `[[`, 1))
  fst    <- as.integer(unlist(lapply(blocks, `[[`, 2)))
  snd    <- as.integer(unlist(lapply(blocks, `[[`, 3)))
  strand <- ifelse(fst < snd, "+", "-")
  start  <- pmin(fst, snd)
  GPos(stitch=FALSE, GRanges(chr, IRanges(start, width = 1), strand))
}

#' import.CAGEscanMolecule
#' 
#' Imports a CAGEscan \dQuote{molecule} file in a [`GRanges`] object
#' 
#' @param filepath The path to the \dQuote{molecule} file.
#' 
#' @seealso parseCAGEscanBlocksToGrangeTSS
#' 
#' @importFrom GenomicRanges GRangesList
#' 
#' @examples
#' # TODO import.CAGEscanMolecule(system.file("extdata", "example.molecule.txt", package = "CAGEr"))
#' 
#' @importFrom data.table fread

import.CAGEscanMolecule <- function(filepath) {
  molecules <- unname(unlist(data.table::fread(select = 9, paste( "grep -v \\#", filepath))))
  molecules <- sub("[,;|].*", "", molecules)
  parseCAGEscanBlocksToGrangeTSS(molecules)
}

#' @rdname getCTSS

setMethod( "getCTSS", "CAGEexp"
         , function ( object
                    , sequencingQualityThreshold, mappingQualityThreshold
                    , removeFirstG, correctSystematicG
                    , useMulticore, nrCores) {
  if(isFALSE(removeFirstG) & isTRUE(correctSystematicG))
    stop("correctSystematicG can not be TRUE while removeFirstG is FALSE")

  # Step 0: Test existance of each file before spending time loading them.
  
  checkFilesExist(inputFiles(object))
  
  # Step 1: Load each file as GRangesList where each GRange is a CTSS data.
  
  l <- as.list(seq_along(inputFiles(object)))

  l <- bplapply(l, function(i) {
      message("\nReading in file: ", inputFiles(object)[i], "...")
      gr <-    loadFileIntoGPos( filepath                   = inputFiles(object)[i]
                               , filetype                   = inputFilesType(object)[i]
                               , sequencingQualityThreshold = sequencingQualityThreshold
                               , mappingQualityThreshold    = mappingQualityThreshold
                               , removeFirstG               = removeFirstG
                               , correctSystematicG         = correctSystematicG
                               , genome                     = genomeName(object))
      gr <- coerceInBSgenome(gr, genomeName(object))
      l[[i]] <- sort(gr)
    }, BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
  
  l <-  GRangesList(l)
  
  # Step 2: Create GPos representing all the nucleotides with CAGE counts in the list.

  rowRanges <- sort(unique(unlist(l)))
  mcols(rowRanges) <- NULL

  # Step 3: Fold the GRangesList in a expression DataFrame of Rle-encoded counts.
  
  assay <- DataFrame(V1 = Rle(rep(0L, length(rowRanges))))
  
  expandRange <- function(global, local) {
    x <- Rle(rep(0L, length(global)))
    x[global %in% local] <- score(local)
    x
  }
  
  for (i in seq_along(l))
    assay[,i] <- expandRange(rowRanges, l[[i]])
  
  rowRanges <- new("CTSS", rowRanges, bsgenomeName = genomeName(object))
  
  colnames(assay) <- sampleLabels(object)
  
  # Setp 4: Put the data in the appropriate slot of the MultiAssayExperiment.

  CTSStagCountSE(object) <-
    SummarizedExperiment( rowRanges = rowRanges
                        , assays    = SimpleList(counts = assay))
  
  # Step 5: update the sample metadata (colData).
  
  librarySizes(object) <- unlist(lapply(CTSStagCountDF(object), sum))
  
  # Setp 6: Return the modified object.
  
  object
})

#' FANTOM5 human samples
#' 
#' Lazy-loaded `data.frame` object, containing information about FANTOM5
#' libraries.  Its use is described in more details in the vignette
#' \dQuote{Use of CAGE resources with CAGEr}.
#' 
#' @format A `data.frame` with `sample`, `type`, `description`, `library_id`
#' and `data_url` columns.
#' 
#' @family FANTOM data
"FANTOM5humanSamples"

#' FANTOM5 mouse samples
#' 
#' Lazy-loaded `data.frame` object, containing information about FANTOM5
#' libraries.  Its use is described in more details in the vignette
#' \dQuote{Use of CAGE resources with CAGEr}.
#' 
#' @format A `data.frame` with `sample`, `type`, `description`, `library_id`
#' and `data_url` columns.
#' 
#' @family FANTOM data
"FANTOM5mouseSamples"


#' importPublicData
#' 
#' Imports CAGE data from different sources into a [`CAGEexp`] object.  After
#' the object has been created the data can be further manipulated and
#' visualized using other functions available in the _CAGEr_ package and
#' integrated with other analyses in R.  Available resources include:
#' 
#'   - FANTOM5 datasets (Forrest _et al.,_ Nature 2014) for numerous human and
#'   mouse samples (primary cells, cell lines and tissues), which are fetched
#'   directly from FANTOM5 online resource at
#'   \href{https://fantom.gsc.riken.jp/5/data}{https://fantom.gsc.riken.jp/5/data}.
#'   
#'   - FANTOM3 and 4 datasets (Carninci _et al., _ Science 2005, Faulkner
#'   _et al.,_ Nature Genetics 2009, Suzuki _et al._ Nature Genetics 2009) from
#'   _FANTOM3and4CAGE_ data package available from Bioconductor.
#'   
#'   - ENCODE datasets (Djebali _et al._ Nature 2012) for numerous human cell
#'   lines from _ENCODEprojectCAGE_ data package, which is available for
#'   download from \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}.
#'   
#'   - Zebrafish (_Danio rerio_) developmental timecourse datasets (Nepal _et al._ Genome
#'   Research 2013) from \emph{ZebrafishDevelopmentalCAGE} data package, which
#'   is available for download from
#'   \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}.
#' 
#' @param origin Character vector specifying one of the available resources for
#'        CAGE data (`"FANTOM5"`, `"FANTOM3and4"`, `"ENCODE"` or `"ZebrafishDevelopment"`).
#'        
#' @param dataset Character vector specifying one or more of the datasets
#'        available in the selected resource. For FANTOM5 it can be either
#'        `"human"` or  `"mouse"`, and only one of them can be specified at a
#'        time. For other resources please refer to the vignette of the
#'        corresponding data package for the list of available datasets.
#'        Multiple datasets mapped to the same genome can be specified to
#'        combine selected samples from each.
#'        
#' @param group Character string specifying one or more groups within specified
#'        dataset(s), from which the samples should be selected.  The `group`
#'        argument is used only when importing TSSs from data packages and
#'        ignored for "FANTOM5".  For available groups in each dataset please
#'        refer to the vignette of the corresponding data package. Either only
#'        one group has to be specified (if all selected samples belong to the
#'        same group) or one group per sample (if samples belong to different
#'        groups).  In the latter case, the number of elements in `group` must
#'        match the number of elements in `sample`.
#'        
#' @param sample Character string specifying one or more CAGE samples.  Check
#'        the corresponding data package for available samples within each group
#'        and their labels. For FANTOM5 resource, list of all human (~1000) and
#'        mouse (~) samples can be obtained in _CAGEr_ by loading
#'        `data(FANTOM5humanSamples)` and `data(FANTOM5mouseSamples)`,
#'        respectively.  Use the names from the \code{sample} column to specify
#'        which samples should be imported.
#' 
#' @return A [`CAGEexp`] object is returned, containing information on library
#' size, CTSS coordinates and tag count matrix.  The object is ready for _CAGEr_
#' analysis (normalisation, tag clustering, …).
#' 
#' @references 
#' 
#'  - Carninci _et al.,_ (2005). _The Transcriptional Landscape of the Mammalian
#'  Genome_. Science **309**(5740):1559-1563.
#'  
#'  - Djebali _et al.,_ (2012). _Landscape of transcription in human cells._
#'  Nature **488**(7414):101-108.
#'  
#'  - Faulkner _et al.,_ (2009). _The regulated retrotransposon transcriptome of
#'  mammalian cells._, Nature Genetics **41**:563-571.
#'  
#'  - Forrest _et al.,_ (2014). _A promoter-level mammalian expression atlas._
#'  Nature **507**(7493):462-470.
#'  
#'  - Nepal _et al.,_ (2013). _Dynamic regulation of the transcription
#'  initiation landscape at single nucleotide resolution during vertebrate
#'  embryogenesis_. Genome Research **23**(11):1938-1950.
#'  
#'  - Suzuki_et al.,_ (2009). The transcriptional network that controls growth
#'  arrest and differentiation in a human myeloid leukemia cell line_. Nature
#'  Genetics **41**:553-562.
#'  
#' @author Vanja Haberle
#' @author Charles Plessy
#' 
#' @family FANTOM data
#' 
#' @examples 
#' \dontrun{
#' ### importing FANTOM5 data
#' 
#' # list of FANTOM5 human tissue samples
#' 
#' data(FANTOM5humanSamples)
#' head(subset(FANTOM5humanSamples, type == "tissue"))
#' 
#' # import selected samples
#' f5 <- importPublicData(
#'   origin="FANTOM5", dataset = "human",
#'   sample = c("adipose_tissue__adult__pool1", "adrenal_gland__adult__pool1",
#'              "aorta__adult__pool1"))
#' 
#' CTSScoordinatesGR(f5)
#' 
#' ### importing FANTOM3/4 data from a data package
#' 
#' library(FANTOM3and4CAGE)
#' 
#' # list of mouse datasets available in this package
#' 
#' data(FANTOMmouseSamples)
#' unique(FANTOMmouseSamples$dataset)
#' head(subset(FANTOMmouseSamples, dataset == "FANTOMtissueCAGEmouse"))
#' head(subset(FANTOMmouseSamples, dataset == "FANTOMtimecourseCAGEmouse"))
#' 
#' # import selected samples from two different mouse datasets
#' 
#' f34 <- importPublicData(
#'   origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"),
#'   group = c("brain", "adipogenic_induction"),
#'   sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+", "DFAT-D1_preadipocytes_2days"))
#' 
#' f34 <- importPublicData(
#'   origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse"),
#'   group = c("brain"),
#'   sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+"))
#' 
#' CTSScoordinatesGR(f34)
#' 
#' }
#' 
#' @importFrom utils data
#' @export

setGeneric("importPublicData",
           function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"),
                    dataset,
                    group,
                    sample)
             standardGeneric("importPublicData"))

.df2SE <- function(df, sample, genome.name) {
  ctssRanges <- CTSS(df$chr,
                     df$pos,
                     df$strand,
                     seqinfo = seqinfo(getRefGenome(genome.name)),
                     bsgenomeName = genome.name)
  ctssDF <- lapply(df[ , sample, drop = FALSE], Rle) |> DataFrame()
  colnames(ctssDF) <- make.names(colnames(ctssDF))
  ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges)
  if (ncol(ctssDF) == 1) {
    ctssSE <- ctssSE[        df[ , sample]  > 0,]
  } else {
    ctssSE <- ctssSE[rowSums(df[ , sample]) > 0,]
  }
  sort(ctssSE)
}

.importPublicData <- function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) {
 # origin <- match.arg(origin)
	if (origin == "ENCODE") {
	  ce <- .importPublicData_ENCODE (dataset = dataset, group = group, sample = sample)
	} else if (origin == "FANTOM3and4") {
    ce <- .importPublicData_F34 (dataset = dataset, group = group, sample = sample)
  } else if (origin == "FANTOM5") {
    ce <- .importPublicData_F5 (dataset = dataset, group = group, sample = sample)
  } else if (origin == "ZebrafishDevelopment") {
    ce <- .importPublicData_ZF (group = group, sample = sample)
  }
  ce
}

.importPublicData_ENCODE <- function (dataset, group = NULL, sample = NULL) {
  stop("ENCODE data support disabled until I find a way to suggest ENCODEprojectCAGE without causing CI crash in GitHub because of WARNING.")
  # if (length(unique(dataset))>1) stop ("Merging datasets not supported yet.")
  # if (! requireNamespace("ENCODEprojectCAGE"))
  #   stop ("This function requires the ", dQuote("ENCODEprojectCAGE"), " package.",
  #         " package; please install it from http://promshift.genereg.net/CAGEr/PackageSource/.")
  # if (dataset == "ENCODEtissueCAGEfly") {
  #   if (group != "embryo") stop("Only 'embryo' is allowed as group for dataset 'ENCODEtissueCAGEfly'.")
  #   if (sample != "mixed_embryos_0-24hr") stop("Only 'mixed_embryos_0-24hr' is allowed as sample for dataset 'ENCODEtissueCAGEfly'.")
  #   if (! requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3"))
  #     stop ("This function requires the ", dQuote("BSgenome.Dmelanogaster.UCSC.dm3"), " package.")
  #   genome.name <- "BSgenome.Dmelanogaster.UCSC.dm3"
  #   ENCODEtissueCAGEfly <- NULL
  #   data("ENCODEtissueCAGEfly", package = "ENCODEprojectCAGE", envir = environment())
  #   df <- ENCODEtissueCAGEfly$embryo
  # } else {
  #   if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg19"))
  #     stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg19"), " package.")
  #   genome.name <- "BSgenome.Hsapiens.UCSC.hg19"
  #   ENCODEhumanCellLinesSamples <- NULL
  #   data("ENCODEhumanCellLinesSamples", package = "ENCODEprojectCAGE", envir = environment())
  #   info.df <- ENCODEhumanCellLinesSamples
  #   if(!(all(dataset %in% info.df$dataset)))
  #     stop("Specified dataset(s) not found! Call data(ENCODEhumanCellLinesSamples) and check 'dataset' column for available ENCODE datasets!")
  #   if(!(all(group %in% info.df[info.df$dataset == dataset,"group"])))
  #     stop("Some of the provided groups cannot be found in the specified dataset!")
  #   if(!(all(sample %in% info.df[,"sample"])))
  #     stop("Some of the provided samples cannot be found!")
  #   data(list= dataset, package = "ENCODEprojectCAGE", envir = environment())
  #   df <- get(dataset)[[group]]
  # }
  # se <- .df2SE(df, sample, genome.name)
  # ce <- CAGEexp(genomeName = genome.name,
  #               colData = DataFrame( sampleLabels = make.names(sample),
  #                                    inputFiles = NA,
  #                                    inputFilesType = 'ctss',
  #                                    librarySizes = sapply(assay(se), sum),
  #                                    row.names = make.names(sample)))
  # CTSStagCountSE(ce) <- se
  # ce
} 

.importPublicData_F34 <- function (dataset, group = NULL, sample = NULL) {
  if (length(unique(dataset))>1) stop("merging datasets not supported yet.")
  if (! requireNamespace("FANTOM3and4CAGE"))
    stop ("This function requires the ", dQuote("FANTOM3and4CAGE"), " package.")
  if (all(dataset %in% c("FANTOMtissueCAGEhuman"  ,   "FANTOMtimecourseCAGEhuman"))) {
    if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg18"))
      stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg18"), " package.")
    FANTOMhumanSamples <- FANTOMtimecourseCAGEhuman <- FANTOMtissueCAGEhuman <- NULL
      data("FANTOMhumanSamples",        package = "FANTOM3and4CAGE", envir = environment())
    data("FANTOMtimecourseCAGEhuman", package = "FANTOM3and4CAGE", envir = environment())
    data("FANTOMtissueCAGEhuman",     package = "FANTOM3and4CAGE", envir = environment())
    samples.info <- FANTOMhumanSamples
    genome.name <- "BSgenome.Hsapiens.UCSC.hg18"
  } else if (all(dataset %in% c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"))) {
    if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9"))
      stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.")
    FANTOMmouseSamples <- NULL
    data("FANTOMmouseSamples", package = "FANTOM3and4CAGE", envir = environment())
    data("FANTOMtimecourseCAGEmouse", package = "FANTOM3and4CAGE", envir = environment())
    data("FANTOMtissueCAGEmouse",     package = "FANTOM3and4CAGE", envir = environment())
    samples.info <- FANTOMmouseSamples
    genome.name <- "BSgenome.Mmusculus.UCSC.mm9"
  } else {
    stop("At least one dataset name is not valid")
  }
  
  validSampleNames <- samples.info$sample
  if (is.null(sample)) sample <- validSampleNames
  if ( ! all(sample %in% validSampleNames))
    stop("At least one sample name is not valid. ",
         "Call data('FANTOMhumanSamples', package='FANTOM3and4CAGE') or ",
         "data('FANTOMmouseSamples', package='FANTOM3and4CAGE') to check valid names.")
  validGroupNames <- unique(samples.info$group)
  if ( ! all(group %in% validGroupNames))
    stop("At least one group name is not valid. ",
         "Call data('FANTOMhumanSamples', package='FANTOM3and4CAGE') or ",
         "data('FANTOMmouseSamples', package='FANTOM3and4CAGE') to check valid names.")
  
  df <- get(dataset)[[group]]
  se <- .df2SE(df, sample, genome.name)
  ce <- CAGEexp(genomeName = genome.name,
                colData = DataFrame( sampleLabels = make.names(sample),
                                     inputFiles = NA,
                                     inputFilesType = 'ctss',
                                     librarySizes = sapply(assay(se), sum),
                                     row.names = make.names(sample)))
  CTSStagCountSE(ce) <- se
  ce
}

.importPublicData_F5 <- function(dataset = c("human", "mouse"), group = NULL, sample = NULL) {
  dataset <- match.arg(dataset)
  if (dataset == "human") {
    if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg19"))
      stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg19"), " package.")
    FANTOM5humanSamples <- NULL
    data("FANTOM5humanSamples", package = "CAGEr", envir = environment())
    samples.info <- FANTOM5humanSamples
    genome.name <- "BSgenome.Hsapiens.UCSC.hg19"
  } else {
    if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9"))
      stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.")
    FANTOM5mouseSamples <- NULL
    data("FANTOM5mouseSamples", package = "CAGEr", envir = environment())
    samples.info <- FANTOM5mouseSamples
    genome.name <- "BSgenome.Mmusculus.UCSC.mm9"
  }
  
  validSampleNames <- samples.info$sample
  if (is.null(sample)) sample <- validSampleNames
  if ( ! all(sample %in% validSampleNames))
    stop("At least one sample name is not valid. ",
         "Call data('FANTOM5humanSamples', package='CAGEr') or ",
         "data('FANTOM5mouseSamples', package='CAGEr') to check valid names.")
  
  ce <- CAGEexp(genomeName = genome.name,
          inputFiles = samples.info[samples.info$sample %in% sample,"data_url"],
          inputFilesType = "bedScore",
          sampleLabels = make.names(sample))
  getCTSS(ce)
}

.importPublicData_ZF <- function(group = "development", sample = NULL) {
  stop("Zebrafish data support disabled until I find a way to suggest ZebrafishDevelopmentalCAGE without causing CI crash in GitHub because of WARNING.")
  # if (group != "development")
  #   stop("Invalid group name! There is only one group in this dataset named 'development'.")
  # if (! requireNamespace("ZebrafishDevelopmentalCAGE"))
  #   stop ("This function requires the ", dQuote("ZebrafishDevelopmentalCAGE"),
  #         " package; please install it from http://promshift.genereg.net/CAGEr/PackageSource/.")
  # if (! requireNamespace("BSgenome.Drerio.UCSC.danRer7"))
  #   stop ("This function requires the ", dQuote("BSgenome.Drerio.UCSC.danRer7"), " package.")
  # 
  # ZebrafishSamples <- NULL
  # data("ZebrafishSamples", package = "ZebrafishDevelopmentalCAGE", envir = environment())
  # 
  # validSampleNames <- levels(ZebrafishSamples$sample)
  # if (is.null(sample )) sample <- validSampleNames
  # if ( ! all(sample %in% validSampleNames))
  #   stop("At least one sample name is not valid. ",
  #        "Valid sample names are: ", paste(validSampleNames, collapse = ", "), ".")
  # 
  # ZebrafishCAGE <- NULL
  # data("ZebrafishCAGE", package = "ZebrafishDevelopmentalCAGE", envir = environment())
  # # The vignette of ZebrafishDevelopmentalCAGE states that the provided coordinates are 1-based.
  # ctssSE <- .df2SE(ZebrafishCAGE$development, sample, "BSgenome.Drerio.UCSC.danRer7")
  # ce <- CAGEexp(genomeName = "BSgenome.Drerio.UCSC.danRer7",
  #               colData = DataFrame( sampleLabels = sample,
  #                                    inputFiles = NA,
  #                                    inputFilesType = 'ctss',
  #                                    librarySizes = sapply(assay(ctssSE), sum),
  #                                    row.names = sample))
  # CTSStagCountSE(ce) <- ctssSE
  # ce
}

#' @rdname importPublicData

setMethod("importPublicData", signature(origin = "character", dataset = "character", sample = "character"),
          .importPublicData)
charles-plessy/CAGEr documentation built on Aug. 2, 2024, 4:35 p.m.