#' @include CAGEr.R CAGEexp.R ClusteringMethods.R CTSS.R GetMethods.R SetMethods.R
#' @name getCTSS
#'
#' @title Reading CAGE data from input file(s) and detecting TSSs
#'
#' @description Reads input CAGE datasets into CAGEr object, constructs CAGE
#' transcriptions start sites (CTSSs) and counts number of CAGE tags supporting every
#' CTSS in each input experiment. See \code{\link{inputFilesType}} for details on
#' the supported input formats. Preprocessing and quality filtering of input CAGE
#' tags, as well as correction of CAGE-specific 'G' nucleotide addition bias can be
#' also performed before constructing TSSs.
#'
#' @param object A [`CAGEexp`] object.
#'
#' @param sequencingQualityThreshold Only CAGE tags with average sequencing quality
#' \code{>= sequencingQualityThreshold} and mapping quality \code{>=
#' mappingQualityThreshold} are kept. Used only if \code{inputFileType(object)
#' == "bam"} or \code{inputFileType(object) == "bamPairedEnd"}, \emph{i.e} when
#' input files are BAM files of aligned sequenced CAGE tags, otherwise ignored. If
#' there are no sequencing quality values in the BAM file (\emph{e.g.} HeliScope
#' single molecule sequencer does not return sequencing qualities) all reads will by
#' default have this value set to -1. Since the default value of
#' \code{sequencingQualityThreshold} is 10, all the reads will consequently be
#' discarded. To avoid this behaviour and keep all sequenced reads set
#' \code{sequencingQualityThreshold} to -1 when processing data without sequencing
#' qualities. If there is no information on mapping quality in the BAM file
#' (\emph{e.g.} software used to align CAGE tags to the referent genome does not
#' provide mapping quality) the \code{mappingQualityThreshold} parameter is ignored.
#' In case of paired-end sequencing BAM file (i.e. \code{inputFileType(object) ==
#' "bamPairedEnd"}) only the first mate of the properly paired reads (i.e. the five
#' prime end read) will be read and subject to specified thresholds.
#'
#' @param mappingQualityThreshold See sequencingQualityThreshold.
#'
#' @param removeFirstG Logical, should the first nucleotide of the CAGE tag be removed
#' in case it is a G and it does not map to the referent genome (\emph{i.e.} it is a
#' mismatch). Used only if \code{inputFileType(object) == "bam"} or
#' \code{inputFileType(object) == "bamPairedEnd"}, \emph{i.e} when input files are
#' BAM files of aligned sequenced CAGE tags, otherwise ignored. See Details.
#'
#' @param correctSystematicG Logical, should the systematic correction of the first G
#' nucleotide be performed for the positions where there is a G in the CAGE tag and G
#' in the genome. This step is performed in addition to removing the first G of the
#' CAGE tags when it is a mismatch, \emph{i.e.} this option can only be used when
#' \code{removeFirstG = TRUE}, otherwise it is ignored. The frequency of adding a G
#' to CAGE tags is estimated from mismatch cases and used to systematically correct
#' the G addition for positions with G in the genome. Used only if
#' \code{inputFileType(object) == "bam"} or \code{inputFileType(object) ==
#' "bamPairedEnd"}, \emph{i.e} when input files are BAM files of aligned sequenced
#' CAGE tags, otherwise ignored. See Details.
#'
#' @param useMulticore Logical, should multicore be used.
#' \code{useMulticore = TRUE} has no effect on non-Unix-like platforms.
#'
#' @param nrCores Number of cores to use when \code{useMulticore = TRUE}
#' (set to \code{NULL} to use all detected cores).
#'
#' @details In the CAGE experimental protocol an additional G nucleotide is often attached
#' to the 5' end of the tag by the template-free activity of the reverse transcriptase used
#' to prepare cDNA (Harbers and Carninci, Nature Methods 2005). In cases where there is a
#' G at the 5' end of the CAGE tag that does not map to the corresponding genome sequence,
#' it can confidently be considered spurious and should be removed from the tag to avoid
#' misannotating actual TSS. Thus, setting \code{removeFirstG = TRUE} is highly recommended.
#'
#' However, when there is a G both at the beginning of the CAGE tag and in the genome, it is
#' not clear whether the original CAGE tag really starts at this position or the G nucleotide
#' was added later in the experimental protocol. To systematically correct CAGE tags mapping
#' at such positions, a general frequency of adding a G to CAGE tags can be calculated from
#' mismatch cases and applied to estimate the number of CAGE tags that have G added and
#' should actually start at the next nucleotide/position. The option \code{correctSystematicG}
#' is an implementation of the correction algorithm described in Carninci \emph{et al.},
#' Nature Genetics 2006, Supplementary Information section 3-e.
#'
#' @return Returns the object, in which the `tagCountMatrix` experiment will be
#' occupied by a [`RangedSummarizedExperiment`] containing the expression data
#' as a `DataFrame` of `Rle` integers, and the CTSS coordinates as genomic
#' ranges in a [`CTSS`] object. The expression data can be retrieved with
#' the [`CTSStagCountDF`] function. In addition, the library sizes are
#' calculated and stored in the object's sample data (see [`librarySizes`]).
#'
#' @references
#'
#' Harbers and Carninci (2005) Tag-based approaches for transcriptome research and genome
#' annotation, \emph{Nature Methods} \bold{2}(7):495-502.
#'
#' Carninci \emph{et al.} (2006) Genome-wide analysis of mammalian promoter architecture and
#' evolution, \emph{Nature Genetics} \bold{38}(7):626-635.
#'
#' @author Vanja Haberle
#'
#' @seealso \code{\link{inputFilesType}}, \code{\link{librarySizes}}.
#'
#' @family CAGEr object modifiers
#'
#' @examples
#' library(BSgenome.Drerio.UCSC.danRer7)
#'
#' pathsToInputFiles <- system.file("extdata", c("Zf.unfertilized.egg.chr17.ctss",
#' "Zf.30p.dome.chr17.ctss", "Zf.prim6.rep1.chr17.ctss"), package="CAGEr")
#'
#' labels <- paste("sample", seq(1,3,1), sep = "")
#'
#' myCAGEexp <- new("CAGEexp", genomeName = "BSgenome.Drerio.UCSC.danRer7",
#' inputFiles = pathsToInputFiles, inputFilesType = "ctss", sampleLabels = labels)
#'
#' myCAGEexp <- getCTSS(myCAGEexp)
#'
#' @docType methods
#'
#' @importFrom S4Vectors DataFrame
#' @importFrom S4Vectors Rle
#' @export
setGeneric( "getCTSS"
, function( object
, sequencingQualityThreshold = 10, mappingQualityThreshold = 20
, removeFirstG = TRUE, correctSystematicG = TRUE
, useMulticore = FALSE, nrCores = NULL)
standardGeneric("getCTSS"))
checkFilesExist <- function(paths) {
for (f in paths)
if (isFALSE(grepl("^http", f)))
if (! file.exists(f)) stop("Could not locate input file ", f)
}
#' toCTSSdt
#'
#' Non-exported private helper function
#'
#' @return Returns a \code{data.table} with TSS indicated in the \dQuote{chr},
#' \dQuote{pos} and \dQuote{strand} columns, and a score column named with
#' the sample label.
#'
#' @noRd
#' @importFrom data.table data.table
#' @importFrom data.table setkeyv
#' @importFrom data.table setnames
toCTSSdt <- function(reads.GRanges, sample.label) {
reads.GRanges.plus <- reads.GRanges[strand(reads.GRanges) == "+"]
reads.GRanges.minus <- reads.GRanges[strand(reads.GRanges) == "-"]
gr2ctssdf <- function(gr, strand)
data.frame( chr = as.character(seqnames(gr))
, pos = if (strand == "+") start(gr) else end(gr)
, strand = strand
, stringsAsFactors = F)
CTSS.plus <- gr2ctssdf(reads.GRanges.plus, "+")
CTSS.minus <-gr2ctssdf(reads.GRanges.minus, "-")
CTSS <- rbind(CTSS.plus, CTSS.minus)
CTSS$tag_count <- 1L
CTSS <- .byCtss(data.table(CTSS), "tag_count", sum)
setnames(CTSS, c("chr", "pos", "strand", sample.label))
setkeyv(CTSS, c("chr", "pos", "strand"))
CTSS
}
addCTSScolumn <- function(CTSS.all.samples, CTSS) {
if(is.null(CTSS.all.samples))
return(CTSS)
merge(CTSS.all.samples, CTSS, all.x = TRUE, all.y = TRUE)
}
#' coerceInBSgenome
#'
#' A private (non-exported) function to discard any range that is
#' not compatible with the CAGEr object's BSgenome.
#'
#' @param gr The genomic ranges to coerce.
#' @param genome The name of a BSgenome package, which must me installed,
#' or \code{NULL} to skip coercion.
#'
#' @return A GRanges object in which every range is guaranteed to be compatible
#' with the given BSgenome object. The sequnames of the GRanges are also set
#' accordingly to the BSgenome.
#'
#' @importFrom GenomeInfoDb seqinfo
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom GenomeInfoDb seqlevels seqlevels<-
#' @importFrom GenomeInfoDb seqnames
#' @importFrom S4Vectors %in%
coerceInBSgenome <- function(gr, genome) {
if (is.null(genome)) return(gr)
genome <- getRefGenome(genome)
gr <- gr[seqnames(gr) %in% seqnames(genome)]
gr <- gr[! end(gr) > seqlengths(genome)[as.character(seqnames(gr))]]
seqlevels(gr) <- seqlevels(genome)
seqinfo(gr) <- seqinfo(genome)
gr
}
#' loadFileIntoGPos
#'
#' A private (non-exported) function to load from each file format supported by CAGEr
#'
#' @param filepath The path to the file to load.
#' @param filetype The type of the file
#' @param sequencingQualityThreshold See getCTSS().
#' @param mappingQualityThreshold See getCTSS().
#' @param removeFirstG See getCTSS().
#' @param correctSystematicG See getCTSS().
#' @param genome See coerceInBSgenome().
#'
#' @return A [GPos()] object where the score represents the number of CAGE tags
#' starting on that nucleotide.
#'
#' @seealso import.CTSS
#' @family loadFileIntoGPos
loadFileIntoGPos <- function( filepath
, filetype = c( "bam", "bamPairedEnd"
, "bed", "bedctss", "bedScore"
, "CAGEscanMolecule", "ctss")
, sequencingQualityThreshold
, mappingQualityThreshold
, removeFirstG
, correctSystematicG
, genome) {
if (missing(filetype)) stop("Please specify the file type.")
filetype <- match.arg(filetype)
switch( filetype
, bam = import.bam.ctss( filepath = filepath
, filetype = "bam"
, sequencingQualityThreshold = sequencingQualityThreshold
, mappingQualityThreshold = mappingQualityThreshold
, removeFirstG = removeFirstG
, correctSystematicG = correctSystematicG
, genome = genome)
, bamPairedEnd = import.bam.ctss( filepath = filepath
, filetype = "bamPairedEnd"
, sequencingQualityThreshold = sequencingQualityThreshold
, mappingQualityThreshold = mappingQualityThreshold
, removeFirstG = removeFirstG
, correctSystematicG = correctSystematicG
, genome = genome)
, bed = import.bedmolecule(filepath)
, bedScore = import.bedScore(filepath)
, bedctss = import.bedCTSS(filepath)
, CAGEscanMolecule = import.CAGEscanMolecule(filepath)
, ctss = import.CTSS(filepath))
}
#' moleculesGR2CTSS
#'
#' Calculates CTSS positions from a GenomicRanges object where each element
#' represents a single molecule.
#'
#' @param gr A [GRanges] object.
#'
#' @return Returns a [GRanges] object.
#'
#' @family loadFileIntoGPos
#'
#' @importFrom S4Vectors Rle
#'
#' @examples
#' gr <- GenomicRanges::GRanges("chr1", IRanges::IRanges(1, 10), c("+", "-", "+"))
#' CAGEr:::moleculesGR2CTSS(gr)
moleculesGR2CTSS <- function(gr) {
tb <- table(promoters(gr, 0, 1))
gp <- as(names(tb), "UnstitchedGPos")
score(gp) <- Rle(as.integer(tb))
gp
}
#' import.bam
#'
#' Imports CTSS data from a BAM file.
#'
#' @param filepath The path to the BAM file.
#' @param filetype bam or bamPairedEnd.
#' @param sequencingQualityThreshold See getCTSS().
#' @param mappingQualityThreshold See getCTSS().
#'
#' @family loadFileIntoGPos
#'
#' @importFrom Rsamtools bamFlag<-
#' @importFrom Rsamtools ScanBamParam
#' @importFrom Rsamtools scanBamFlag
#' @importFrom GenomicAlignments readGAlignments
#' @importFrom GenomicAlignments qwidth
#'
#' @examples
#' # TODO: add exmaple file
#' # import.bam(system.file("extdata", "example.bam", package = "CAGEr"))
import.bam <- function( filepath
, filetype
, sequencingQualityThreshold = 10
, mappingQualityThreshold = 20) {
param <- ScanBamParam( what = c("seq", "qual")
, flag = scanBamFlag(isUnmappedQuery = FALSE)
, mapqFilter = mappingQualityThreshold)
if (filetype == "bamPairedEnd")
bamFlag(param) <- scanBamFlag( isUnmappedQuery = FALSE
, isProperPair = TRUE
, isFirstMateRead = TRUE)
bam <- readGAlignments(filepath, param = param)
message("\t-> Filtering out low quality reads...")
# We need to loop on qualities because there is a hard limit on the length.
# See <https://support.bioconductor.org/p/97993/>.
qual <- mcols(bam)$qual
start <- 1
chunksize <- 1e6
qual.avg <- vector(mode = "integer")
repeat {
if (start + chunksize <= length(qual)) {
end <- start + chunksize
} else {
end <- length(qual)
}
qual.avg <- c(qual.avg, as.integer(mean(as(qual[start:end], "IntegerList"))))
if (end == length(qual)) {
break
} else {
start <- end + 1
}
}
bam <- bam[qual.avg >= sequencingQualityThreshold]
gr <- GRanges(bam)
gr$read.length <- qwidth(bam)
gr
}
#' bam2CTSS
#'
#' Converts from BAM to CTSS
#'
#' Converts genomic ranges representing SAM/BAM alignments into a CTSS object.
#'
#' @param gr A \code{\link{GRanges}} object returned by \code{\link{import.bam}}.
#' @param removeFirstG See getCTSS().
#' @param correctSystematicG See getCTSS().
#' @param genome See coerceInBSgenome().
#'
#' @return Returns a \code{\link{CTSS}} object.
#'
#' @importFrom GenomeInfoDb bsgenomeName
#' @family loadFileIntoGPos
bam2CTSS <- function(gr, removeFirstG, correctSystematicG, genome) {
gr <- coerceInBSgenome(gr, genome)
if(removeFirstG == TRUE) {
message("\t-> Removing the first base of the reads if 'G' and not aligned to the genome...")
if (isFALSE(correctSystematicG)) {
gp <- .remove.added.G.CTSS(gr, genome, correctSystematicG = FALSE)
} else {
# Using the old CAGEset function that returns a data.table
gp <- .remove.added.G(gr, genome, correctSystematicG = TRUE, "score")
return(gp)
}
} else {
gp <- CTSS(promoters(gr, 0, 1))
}
tb <- table(gp)
gp <- CTSS(names(tb), bsgenomeName = genome)
score(gp) <- Rle(unclass(tb))
gp
}
#' import.bam.ctss
#'
#' Imports CTSS data from a BAM file.
#'
#' @param filepath The path to the BAM file.
#' @param filetype bam or bamPairedEnd.
#' @param sequencingQualityThreshold See getCTSS().
#' @param mappingQualityThreshold See getCTSS().
#' @param removeFirstG See getCTSS().
#' @param correctSystematicG See getCTSS().
#' @param genome See coerceInBSgenome().
#'
#' @return Returns a [CTSS] object.
#'
#' @family loadFileIntoGPos
import.bam.ctss <- function( filepath, filetype, sequencingQualityThreshold
, mappingQualityThreshold, removeFirstG
, correctSystematicG, genome) {
bam <- import.bam( filepath = filepath
, filetype = filetype
, sequencingQualityThreshold = sequencingQualityThreshold
, mappingQualityThreshold = mappingQualityThreshold)
bam2CTSS( gr= bam
, removeFirstG = removeFirstG
, correctSystematicG = correctSystematicG
, genome = genome)
}
#' import.bedmolecule
#'
#' Imports a BED file where each line counts for one molecule in a
#' GRanges object where each line represents one nucleotide.
#'
#' @param filepath The path to the BED file.
#'
#' @return Returns a \code{\link{CTSS}} object.
#'
#' @family loadFileIntoGPos
#'
#' @importFrom rtracklayer import.bed
#' @importFrom S4Vectors Rle
#'
#' @examples
#' # TODO: add exmaple file
#' # import.BED(system.file("extdata", "example.bed", package = "CAGEr"))
import.bedmolecule <- function(filepath) {
moleculesGR2CTSS(rtracklayer::import.bed(filepath))
}
#' import.bedScore
#'
#' Imports a BED file where the score indicates a number of counts
#' for a given alignment.
#'
#' @param filepath The path to the BED file.
#'
#' @return A GRanges object where each line represents one nucleotide.
#'
#' @family loadFileIntoGPos
#'
#' @importFrom rtracklayer import.bed
#' @importFrom S4Vectors Rle
#'
#' @examples
#' # TODO: add exmaple file
#' # import.bedScore(system.file("extdata", "example.bed", package = "CAGEr"))
import.bedScore <- function(filepath) {
gr <- rtracklayer::import.bed(filepath)
df <- data.frame( p = as.character(promoters(gr, 0, 1))
, s = score(gr))
df <- rowsum(df$s, df$p)
gp <- as(rownames(df), "UnstitchedGPos")
score(gp) <- Rle(as.integer(df[,1]))
gp
}
#' import.bedCTSS
#'
#' Imports a BED file where each line represents a single base, with a score
#' counting the number of CAGE transcription start sites (CTSS).
#'
#' @return A GRanges object where each line represents one nucleotide.
#'
#' @param filepath The path to the BED file.
#'
#' @family loadFileIntoGPos
#'
#' @importFrom rtracklayer import.bed
#' @importFrom GenomeInfoDb sortSeqlevels
#' @importFrom GenomicRanges countOverlaps
#'
#' @examples
#' # TODO: add exmaple file
#' # import.BED(system.file("extdata", "example.bed", package = "CAGEr"))
import.bedCTSS <- function(filepath) {
gr <- rtracklayer::import.bed(filepath)
if (length(unique(gr)) != length(gr))
stop("Input must not contain the same range multiple times.")
if (any(countOverlaps(gr) != 1))
stop("Input must not contain overlapping ranges.")
seqlevels(gr) <- sortSeqlevels(seqlevels(gr))
mcols(gr) <- DataFrame(score = Rle(as.integer(score(gr))))
as(gr, "UnstitchedGPos")
}
#' import.CTSS
#'
#' Imports a "CTSS" file in a [GPos] object
#'
#' @param filepath The path to the "CTSS" file.
#'
#' Note that the format of the "CTSS" files handled in this function is not
#' the same as the FANTOM5 "CTSS" files (which are plain BED).
#'
#' @family loadFileIntoGPos
#'
#' @importFrom utils read.table
#'
#' @examples
#' CAGEr:::import.CTSS(system.file("extdata", "Zf.high.chr17.ctss", package = "CAGEr"))
import.CTSS <- function(filepath) {
CTSS <- read.table( file = filepath
, header = F
, sep = "\t"
, col.names = c("chr", "pos", "strand", "score")
, colClasses = c("character", "integer", "character", "integer"))
gp <- GPos( stitch = FALSE
, GRanges( seqnames = CTSS$chr
, ranges = IRanges(CTSS$pos, width = 1)
, strand = CTSS$strand))
score(gp) <- CTSS$score
gp
}
#' parseCAGEscanBlocksToGrangeTSS
#'
#' Parse a string describing a block in a CAGEscan molecule, as output by
#' the "CAGEscan 3.0" pipeline.
#'
#' @param blocks A character string representing a block in a CAGEscan molecule.
#'
#' @return A GRanges object representing a TSS.
#'
#' In CAGEscan molecules, blocks are separated by \sQuote{|}, \sQuote{,} or
#' \sQuote{;} for gap of coverage, splice junction (confident) and splice
#' junction (maybe) respectively. Strand is "+" if first coordinate is lower
#' than the second one, and "-" otherwise.
#'
#' @seealso import.CAGEscanMolecule
#'
#' @examples
#' myMolecule <- paste0( "chr11:66268633-66268693,"
#' , "chr11:66271796-66271869;"
#' , "chr11:66272156-66272252|"
#' , "chr11:66272364-66272460")
#' myFirstBlock <- sub("[,;|].*", "", myMolecule)
#'
#' CAGEr:::parseCAGEscanBlocksToGrangeTSS(myFirstBlock)
parseCAGEscanBlocksToGrangeTSS <- function (blocks) {
blocks <- strsplit(blocks, "[:-]")
chr <- unlist(lapply(blocks, `[[`, 1))
fst <- as.integer(unlist(lapply(blocks, `[[`, 2)))
snd <- as.integer(unlist(lapply(blocks, `[[`, 3)))
strand <- ifelse(fst < snd, "+", "-")
start <- pmin(fst, snd)
GPos(stitch=FALSE, GRanges(chr, IRanges(start, width = 1), strand))
}
#' import.CAGEscanMolecule
#'
#' Imports a CAGEscan \dQuote{molecule} file in a [`GRanges`] object
#'
#' @param filepath The path to the \dQuote{molecule} file.
#'
#' @seealso parseCAGEscanBlocksToGrangeTSS
#'
#' @importFrom GenomicRanges GRangesList
#'
#' @examples
#' # TODO import.CAGEscanMolecule(system.file("extdata", "example.molecule.txt", package = "CAGEr"))
#'
#' @importFrom data.table fread
import.CAGEscanMolecule <- function(filepath) {
molecules <- unname(unlist(data.table::fread(select = 9, paste( "grep -v \\#", filepath))))
molecules <- sub("[,;|].*", "", molecules)
parseCAGEscanBlocksToGrangeTSS(molecules)
}
#' @rdname getCTSS
setMethod( "getCTSS", "CAGEexp"
, function ( object
, sequencingQualityThreshold, mappingQualityThreshold
, removeFirstG, correctSystematicG
, useMulticore, nrCores) {
if(isFALSE(removeFirstG) & isTRUE(correctSystematicG))
stop("correctSystematicG can not be TRUE while removeFirstG is FALSE")
# Step 0: Test existance of each file before spending time loading them.
checkFilesExist(inputFiles(object))
# Step 1: Load each file as GRangesList where each GRange is a CTSS data.
l <- as.list(seq_along(inputFiles(object)))
l <- bplapply(l, function(i) {
message("\nReading in file: ", inputFiles(object)[i], "...")
gr <- loadFileIntoGPos( filepath = inputFiles(object)[i]
, filetype = inputFilesType(object)[i]
, sequencingQualityThreshold = sequencingQualityThreshold
, mappingQualityThreshold = mappingQualityThreshold
, removeFirstG = removeFirstG
, correctSystematicG = correctSystematicG
, genome = genomeName(object))
gr <- coerceInBSgenome(gr, genomeName(object))
l[[i]] <- sort(gr)
}, BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
l <- GRangesList(l)
# Step 2: Create GPos representing all the nucleotides with CAGE counts in the list.
rowRanges <- sort(unique(unlist(l)))
mcols(rowRanges) <- NULL
# Step 3: Fold the GRangesList in a expression DataFrame of Rle-encoded counts.
assay <- DataFrame(V1 = Rle(rep(0L, length(rowRanges))))
expandRange <- function(global, local) {
x <- Rle(rep(0L, length(global)))
x[global %in% local] <- score(local)
x
}
for (i in seq_along(l))
assay[,i] <- expandRange(rowRanges, l[[i]])
rowRanges <- new("CTSS", rowRanges, bsgenomeName = genomeName(object))
colnames(assay) <- sampleLabels(object)
# Setp 4: Put the data in the appropriate slot of the MultiAssayExperiment.
CTSStagCountSE(object) <-
SummarizedExperiment( rowRanges = rowRanges
, assays = SimpleList(counts = assay))
# Step 5: update the sample metadata (colData).
librarySizes(object) <- unlist(lapply(CTSStagCountDF(object), sum))
# Setp 6: Return the modified object.
object
})
#' FANTOM5 human samples
#'
#' Lazy-loaded `data.frame` object, containing information about FANTOM5
#' libraries. Its use is described in more details in the vignette
#' \dQuote{Use of CAGE resources with CAGEr}.
#'
#' @format A `data.frame` with `sample`, `type`, `description`, `library_id`
#' and `data_url` columns.
#'
#' @family FANTOM data
"FANTOM5humanSamples"
#' FANTOM5 mouse samples
#'
#' Lazy-loaded `data.frame` object, containing information about FANTOM5
#' libraries. Its use is described in more details in the vignette
#' \dQuote{Use of CAGE resources with CAGEr}.
#'
#' @format A `data.frame` with `sample`, `type`, `description`, `library_id`
#' and `data_url` columns.
#'
#' @family FANTOM data
"FANTOM5mouseSamples"
#' importPublicData
#'
#' Imports CAGE data from different sources into a [`CAGEexp`] object. After
#' the object has been created the data can be further manipulated and
#' visualized using other functions available in the _CAGEr_ package and
#' integrated with other analyses in R. Available resources include:
#'
#' - FANTOM5 datasets (Forrest _et al.,_ Nature 2014) for numerous human and
#' mouse samples (primary cells, cell lines and tissues), which are fetched
#' directly from FANTOM5 online resource at
#' \href{https://fantom.gsc.riken.jp/5/data}{https://fantom.gsc.riken.jp/5/data}.
#'
#' - FANTOM3 and 4 datasets (Carninci _et al., _ Science 2005, Faulkner
#' _et al.,_ Nature Genetics 2009, Suzuki _et al._ Nature Genetics 2009) from
#' _FANTOM3and4CAGE_ data package available from Bioconductor.
#'
#' - ENCODE datasets (Djebali _et al._ Nature 2012) for numerous human cell
#' lines from _ENCODEprojectCAGE_ data package, which is available for
#' download from \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}.
#'
#' - Zebrafish (_Danio rerio_) developmental timecourse datasets (Nepal _et al._ Genome
#' Research 2013) from \emph{ZebrafishDevelopmentalCAGE} data package, which
#' is available for download from
#' \href{http://promshift.genereg.net/CAGEr/}{http://promshift.genereg.net/CAGEr/}.
#'
#' @param origin Character vector specifying one of the available resources for
#' CAGE data (`"FANTOM5"`, `"FANTOM3and4"`, `"ENCODE"` or `"ZebrafishDevelopment"`).
#'
#' @param dataset Character vector specifying one or more of the datasets
#' available in the selected resource. For FANTOM5 it can be either
#' `"human"` or `"mouse"`, and only one of them can be specified at a
#' time. For other resources please refer to the vignette of the
#' corresponding data package for the list of available datasets.
#' Multiple datasets mapped to the same genome can be specified to
#' combine selected samples from each.
#'
#' @param group Character string specifying one or more groups within specified
#' dataset(s), from which the samples should be selected. The `group`
#' argument is used only when importing TSSs from data packages and
#' ignored for "FANTOM5". For available groups in each dataset please
#' refer to the vignette of the corresponding data package. Either only
#' one group has to be specified (if all selected samples belong to the
#' same group) or one group per sample (if samples belong to different
#' groups). In the latter case, the number of elements in `group` must
#' match the number of elements in `sample`.
#'
#' @param sample Character string specifying one or more CAGE samples. Check
#' the corresponding data package for available samples within each group
#' and their labels. For FANTOM5 resource, list of all human (~1000) and
#' mouse (~) samples can be obtained in _CAGEr_ by loading
#' `data(FANTOM5humanSamples)` and `data(FANTOM5mouseSamples)`,
#' respectively. Use the names from the \code{sample} column to specify
#' which samples should be imported.
#'
#' @return A [`CAGEexp`] object is returned, containing information on library
#' size, CTSS coordinates and tag count matrix. The object is ready for _CAGEr_
#' analysis (normalisation, tag clustering, …).
#'
#' @references
#'
#' - Carninci _et al.,_ (2005). _The Transcriptional Landscape of the Mammalian
#' Genome_. Science **309**(5740):1559-1563.
#'
#' - Djebali _et al.,_ (2012). _Landscape of transcription in human cells._
#' Nature **488**(7414):101-108.
#'
#' - Faulkner _et al.,_ (2009). _The regulated retrotransposon transcriptome of
#' mammalian cells._, Nature Genetics **41**:563-571.
#'
#' - Forrest _et al.,_ (2014). _A promoter-level mammalian expression atlas._
#' Nature **507**(7493):462-470.
#'
#' - Nepal _et al.,_ (2013). _Dynamic regulation of the transcription
#' initiation landscape at single nucleotide resolution during vertebrate
#' embryogenesis_. Genome Research **23**(11):1938-1950.
#'
#' - Suzuki_et al.,_ (2009). The transcriptional network that controls growth
#' arrest and differentiation in a human myeloid leukemia cell line_. Nature
#' Genetics **41**:553-562.
#'
#' @author Vanja Haberle
#' @author Charles Plessy
#'
#' @family FANTOM data
#'
#' @examples
#' \dontrun{
#' ### importing FANTOM5 data
#'
#' # list of FANTOM5 human tissue samples
#'
#' data(FANTOM5humanSamples)
#' head(subset(FANTOM5humanSamples, type == "tissue"))
#'
#' # import selected samples
#' f5 <- importPublicData(
#' origin="FANTOM5", dataset = "human",
#' sample = c("adipose_tissue__adult__pool1", "adrenal_gland__adult__pool1",
#' "aorta__adult__pool1"))
#'
#' CTSScoordinatesGR(f5)
#'
#' ### importing FANTOM3/4 data from a data package
#'
#' library(FANTOM3and4CAGE)
#'
#' # list of mouse datasets available in this package
#'
#' data(FANTOMmouseSamples)
#' unique(FANTOMmouseSamples$dataset)
#' head(subset(FANTOMmouseSamples, dataset == "FANTOMtissueCAGEmouse"))
#' head(subset(FANTOMmouseSamples, dataset == "FANTOMtimecourseCAGEmouse"))
#'
#' # import selected samples from two different mouse datasets
#'
#' f34 <- importPublicData(
#' origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"),
#' group = c("brain", "adipogenic_induction"),
#' sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+", "DFAT-D1_preadipocytes_2days"))
#'
#' f34 <- importPublicData(
#' origin="FANTOM3and4", dataset = c("FANTOMtissueCAGEmouse"),
#' group = c("brain"),
#' sample = c("CCL-131_Neuro-2a_treatment_for_6hr_with_MPP+"))
#'
#' CTSScoordinatesGR(f34)
#'
#' }
#'
#' @importFrom utils data
#' @export
setGeneric("importPublicData",
function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"),
dataset,
group,
sample)
standardGeneric("importPublicData"))
.df2SE <- function(df, sample, genome.name) {
ctssRanges <- CTSS(df$chr,
df$pos,
df$strand,
seqinfo = seqinfo(getRefGenome(genome.name)),
bsgenomeName = genome.name)
ctssDF <- lapply(df[ , sample, drop = FALSE], Rle) |> DataFrame()
colnames(ctssDF) <- make.names(colnames(ctssDF))
ctssSE <- SummarizedExperiment(c(counts = ctssDF), ctssRanges)
if (ncol(ctssDF) == 1) {
ctssSE <- ctssSE[ df[ , sample] > 0,]
} else {
ctssSE <- ctssSE[rowSums(df[ , sample]) > 0,]
}
sort(ctssSE)
}
.importPublicData <- function(origin = c("FANTOM5", "FANTOM3and4", "ENCODE", "ZebrafishDevelopment"), dataset, group, sample) {
# origin <- match.arg(origin)
if (origin == "ENCODE") {
ce <- .importPublicData_ENCODE (dataset = dataset, group = group, sample = sample)
} else if (origin == "FANTOM3and4") {
ce <- .importPublicData_F34 (dataset = dataset, group = group, sample = sample)
} else if (origin == "FANTOM5") {
ce <- .importPublicData_F5 (dataset = dataset, group = group, sample = sample)
} else if (origin == "ZebrafishDevelopment") {
ce <- .importPublicData_ZF (group = group, sample = sample)
}
ce
}
.importPublicData_ENCODE <- function (dataset, group = NULL, sample = NULL) {
stop("ENCODE data support disabled until I find a way to suggest ENCODEprojectCAGE without causing CI crash in GitHub because of WARNING.")
# if (length(unique(dataset))>1) stop ("Merging datasets not supported yet.")
# if (! requireNamespace("ENCODEprojectCAGE"))
# stop ("This function requires the ", dQuote("ENCODEprojectCAGE"), " package.",
# " package; please install it from http://promshift.genereg.net/CAGEr/PackageSource/.")
# if (dataset == "ENCODEtissueCAGEfly") {
# if (group != "embryo") stop("Only 'embryo' is allowed as group for dataset 'ENCODEtissueCAGEfly'.")
# if (sample != "mixed_embryos_0-24hr") stop("Only 'mixed_embryos_0-24hr' is allowed as sample for dataset 'ENCODEtissueCAGEfly'.")
# if (! requireNamespace("BSgenome.Dmelanogaster.UCSC.dm3"))
# stop ("This function requires the ", dQuote("BSgenome.Dmelanogaster.UCSC.dm3"), " package.")
# genome.name <- "BSgenome.Dmelanogaster.UCSC.dm3"
# ENCODEtissueCAGEfly <- NULL
# data("ENCODEtissueCAGEfly", package = "ENCODEprojectCAGE", envir = environment())
# df <- ENCODEtissueCAGEfly$embryo
# } else {
# if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg19"))
# stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg19"), " package.")
# genome.name <- "BSgenome.Hsapiens.UCSC.hg19"
# ENCODEhumanCellLinesSamples <- NULL
# data("ENCODEhumanCellLinesSamples", package = "ENCODEprojectCAGE", envir = environment())
# info.df <- ENCODEhumanCellLinesSamples
# if(!(all(dataset %in% info.df$dataset)))
# stop("Specified dataset(s) not found! Call data(ENCODEhumanCellLinesSamples) and check 'dataset' column for available ENCODE datasets!")
# if(!(all(group %in% info.df[info.df$dataset == dataset,"group"])))
# stop("Some of the provided groups cannot be found in the specified dataset!")
# if(!(all(sample %in% info.df[,"sample"])))
# stop("Some of the provided samples cannot be found!")
# data(list= dataset, package = "ENCODEprojectCAGE", envir = environment())
# df <- get(dataset)[[group]]
# }
# se <- .df2SE(df, sample, genome.name)
# ce <- CAGEexp(genomeName = genome.name,
# colData = DataFrame( sampleLabels = make.names(sample),
# inputFiles = NA,
# inputFilesType = 'ctss',
# librarySizes = sapply(assay(se), sum),
# row.names = make.names(sample)))
# CTSStagCountSE(ce) <- se
# ce
}
.importPublicData_F34 <- function (dataset, group = NULL, sample = NULL) {
if (length(unique(dataset))>1) stop("merging datasets not supported yet.")
if (! requireNamespace("FANTOM3and4CAGE"))
stop ("This function requires the ", dQuote("FANTOM3and4CAGE"), " package.")
if (all(dataset %in% c("FANTOMtissueCAGEhuman" , "FANTOMtimecourseCAGEhuman"))) {
if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg18"))
stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg18"), " package.")
FANTOMhumanSamples <- FANTOMtimecourseCAGEhuman <- FANTOMtissueCAGEhuman <- NULL
data("FANTOMhumanSamples", package = "FANTOM3and4CAGE", envir = environment())
data("FANTOMtimecourseCAGEhuman", package = "FANTOM3and4CAGE", envir = environment())
data("FANTOMtissueCAGEhuman", package = "FANTOM3and4CAGE", envir = environment())
samples.info <- FANTOMhumanSamples
genome.name <- "BSgenome.Hsapiens.UCSC.hg18"
} else if (all(dataset %in% c("FANTOMtissueCAGEmouse", "FANTOMtimecourseCAGEmouse"))) {
if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9"))
stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.")
FANTOMmouseSamples <- NULL
data("FANTOMmouseSamples", package = "FANTOM3and4CAGE", envir = environment())
data("FANTOMtimecourseCAGEmouse", package = "FANTOM3and4CAGE", envir = environment())
data("FANTOMtissueCAGEmouse", package = "FANTOM3and4CAGE", envir = environment())
samples.info <- FANTOMmouseSamples
genome.name <- "BSgenome.Mmusculus.UCSC.mm9"
} else {
stop("At least one dataset name is not valid")
}
validSampleNames <- samples.info$sample
if (is.null(sample)) sample <- validSampleNames
if ( ! all(sample %in% validSampleNames))
stop("At least one sample name is not valid. ",
"Call data('FANTOMhumanSamples', package='FANTOM3and4CAGE') or ",
"data('FANTOMmouseSamples', package='FANTOM3and4CAGE') to check valid names.")
validGroupNames <- unique(samples.info$group)
if ( ! all(group %in% validGroupNames))
stop("At least one group name is not valid. ",
"Call data('FANTOMhumanSamples', package='FANTOM3and4CAGE') or ",
"data('FANTOMmouseSamples', package='FANTOM3and4CAGE') to check valid names.")
df <- get(dataset)[[group]]
se <- .df2SE(df, sample, genome.name)
ce <- CAGEexp(genomeName = genome.name,
colData = DataFrame( sampleLabels = make.names(sample),
inputFiles = NA,
inputFilesType = 'ctss',
librarySizes = sapply(assay(se), sum),
row.names = make.names(sample)))
CTSStagCountSE(ce) <- se
ce
}
.importPublicData_F5 <- function(dataset = c("human", "mouse"), group = NULL, sample = NULL) {
dataset <- match.arg(dataset)
if (dataset == "human") {
if (! requireNamespace("BSgenome.Hsapiens.UCSC.hg19"))
stop ("This function requires the ", dQuote("BSgenome.Hsapiens.UCSC.hg19"), " package.")
FANTOM5humanSamples <- NULL
data("FANTOM5humanSamples", package = "CAGEr", envir = environment())
samples.info <- FANTOM5humanSamples
genome.name <- "BSgenome.Hsapiens.UCSC.hg19"
} else {
if (! requireNamespace("BSgenome.Mmusculus.UCSC.mm9"))
stop ("This function requires the ", dQuote("BSgenome.Mmusculus.UCSC.mm9"), " package.")
FANTOM5mouseSamples <- NULL
data("FANTOM5mouseSamples", package = "CAGEr", envir = environment())
samples.info <- FANTOM5mouseSamples
genome.name <- "BSgenome.Mmusculus.UCSC.mm9"
}
validSampleNames <- samples.info$sample
if (is.null(sample)) sample <- validSampleNames
if ( ! all(sample %in% validSampleNames))
stop("At least one sample name is not valid. ",
"Call data('FANTOM5humanSamples', package='CAGEr') or ",
"data('FANTOM5mouseSamples', package='CAGEr') to check valid names.")
ce <- CAGEexp(genomeName = genome.name,
inputFiles = samples.info[samples.info$sample %in% sample,"data_url"],
inputFilesType = "bedScore",
sampleLabels = make.names(sample))
getCTSS(ce)
}
.importPublicData_ZF <- function(group = "development", sample = NULL) {
stop("Zebrafish data support disabled until I find a way to suggest ZebrafishDevelopmentalCAGE without causing CI crash in GitHub because of WARNING.")
# if (group != "development")
# stop("Invalid group name! There is only one group in this dataset named 'development'.")
# if (! requireNamespace("ZebrafishDevelopmentalCAGE"))
# stop ("This function requires the ", dQuote("ZebrafishDevelopmentalCAGE"),
# " package; please install it from http://promshift.genereg.net/CAGEr/PackageSource/.")
# if (! requireNamespace("BSgenome.Drerio.UCSC.danRer7"))
# stop ("This function requires the ", dQuote("BSgenome.Drerio.UCSC.danRer7"), " package.")
#
# ZebrafishSamples <- NULL
# data("ZebrafishSamples", package = "ZebrafishDevelopmentalCAGE", envir = environment())
#
# validSampleNames <- levels(ZebrafishSamples$sample)
# if (is.null(sample )) sample <- validSampleNames
# if ( ! all(sample %in% validSampleNames))
# stop("At least one sample name is not valid. ",
# "Valid sample names are: ", paste(validSampleNames, collapse = ", "), ".")
#
# ZebrafishCAGE <- NULL
# data("ZebrafishCAGE", package = "ZebrafishDevelopmentalCAGE", envir = environment())
# # The vignette of ZebrafishDevelopmentalCAGE states that the provided coordinates are 1-based.
# ctssSE <- .df2SE(ZebrafishCAGE$development, sample, "BSgenome.Drerio.UCSC.danRer7")
# ce <- CAGEexp(genomeName = "BSgenome.Drerio.UCSC.danRer7",
# colData = DataFrame( sampleLabels = sample,
# inputFiles = NA,
# inputFilesType = 'ctss',
# librarySizes = sapply(assay(ctssSE), sum),
# row.names = sample))
# CTSStagCountSE(ce) <- ctssSE
# ce
}
#' @rdname importPublicData
setMethod("importPublicData", signature(origin = "character", dataset = "character", sample = "character"),
.importPublicData)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.