#' Process reads from High-Troughtput Sequencing experiments
#'
#' This method allows the processment of NGS nucleosome reads from different
#' sources and a basic manipulation of them. The tasks includes the correction
#' of strand-specific single-end reads and the trimming of reads to a given
#' length.
#'
#' This function reads a [ShortRead::AlignedRead] or a [GenomicRanges::GRanges]
#' object containing the position, length and strand of the sequence reads.
#'
#' It allows the processment of both paired and single ended reads. In the case
#' of single end reads this function corrects the strand-specific mapping by
#' shifting plus strand reads and minus strand reads towards a middle position
#' where both strands are overlaped. This is done by accounting the expected
#' fragment length (`fragmentLen`).
#'
#' For paired end reads, mononucleosomal reads could extend more than expected
#' length due to mapping issues or experimental conditions. In this case, the
#' `fragmentLen` variable sets the threshold from which reads longer than it
#' should be ignored.
#'
#' If no value is supplied for `fragmentLen` it will be calculated
#' automatically (increasing the computing time) using `fragmentLenDetect` with
#' default parameters. Performance can be increased by tunning
#' `fragmentLenDetect` parameteres in a separated call and passing its result
#' as `fragmentLen` parameter.
#'
#' In some cases, could be useful trim the reads to a shorter length to improve
#' the detection of nucleosome dyads, easing its detection and automatic
#' positioning. The parameter `trim` allows the selection of how many
#' nucleotides select from each read.
#'
#' A special case for single-ended data is setting the `trim` to the same
#' value as `fragmentLen`, so the reads will be extended strand-wise towards
#' the 3' direction, creating an artificial map comparable with
#' paired-ended data. The same but opposite can be performed with paired-end
#' data, setting a `trim` value equal to the read length from paired ended, so
#' paired-ended data will look like single-ended.
#'
#' @param data Sequence reads objects, probably imported using other packages
#' as `ShortRead`. Allowed object types are [ShortRead::AlignedRead] and
#' [GenomicRanges::GRanges] with a `strand` attribute.
#' @param type Describes the type of reads. Values allowed are `single` for
#' single-ended reads and `paired` for paired-ended.
#' @param fragmentLen Expected original length of the sequenced fragments. See
#' details.
#' @param trim Length to trim the reads (or extend them if `trim` > read
#' length)
#' @param \dots Other parameters passed to `fragmentLenDetect` if no fixed
#' `fragmentLen` is given.
#'
#' @return [GenomicRanges::GRanges] containing the aligned/trimmed individual
#' reads.
#'
#' @note **IMPORTANT**: this information is only used to correct possible
#' strand-specific mapping, this package doesn't link the two ends of paired
#' reads.
#' @author Oscar Flores \email{oflores@@mmb.pcb.ub.es}
#' @seealso [ShortRead::AlignedRead], [GenomicRanges::GRanges],
#' [fragmentLenDetect()]
#' @keywords manip
#' @rdname processReads
#'
#' @examples
#' # Load data
#' data(nucleosome_htseq)
#'
#' # Process nucleosome reads, select only those shorter than 200bp
#' pr1 <- processReads(nucleosome_htseq, fragmentLen=200)
#'
#' # Now process them, but picking only the 40 bases surrounding the dyad
#' pr2 <- processReads(nucleosome_htseq, fragmentLen=200, trim=40)
#'
#' # Compare the results:
#' library(ggplot2)
#' cov1 <- as.vector(coverage.rpm(pr1)[["chr1"]])
#' cov2 <- as.vector(coverage.rpm(pr2)[["chr1"]])
#' plot_data <- rbind(
#' data.frame(x=seq_along(cov1), y=cov1, coverage="original"),
#' data.frame(x=seq_along(cov2), y=cov2, coverage="trimmed")
#' )
#' qplot(x=x, y=y, geom="line", data=plot_data, xlab="position",
#' ylab="coverage") + facet_grid(coverage~.)
#'
#' @export
#'
setGeneric(
"processReads",
function(data, type="single", fragmentLen, trim, ...)
standardGeneric("processReads")
)
#' @rdname processReads
#' @importMethodsFrom BiocGenerics strand
#' @importFrom grDevices rgb
#' @importMethodsFrom Biostrings nchar
#' @importMethodsFrom ShortRead position sread chromosome
setMethod(
"processReads",
signature(data="AlignedRead"),
function (data, type="single", fragmentLen, trim, ...) {
# require("ShortRead")
if (missing(fragmentLen)) {
if (type == "single") {
message(
" * fragmentLen not provided for strand correction,",
"infering automatically..."
)
fragmentLen <- fragmentLenDetect(data, ...)
message(paste(" * fragmentLen =", fragmentLen))
} else {
fragmentLen <- Inf # Don't remove anything
}
}
#######################################################################
if (type == "single") { # Special case for trim==fragmentLength
if (!missing(trim)) {
if (trim == fragmentLen) {
start <- position(data)
neg.strand <- start + nchar(sread(data))
sel <- strand(data) == "-"
start[sel] <- neg.strand[sel]
res <- GRanges(
ranges = IRanges(start=start, width=fragmentLen),
seqnames = chromosome(data)
)
return (res)
}
}
# If no trim restriction, use whole read length,
# else use trim length
sr_len <- nchar(sread(data))
if (!missing(trim)) {
sr_len <- trim
}
# Floor, then we will add 1 if needed
shift <- floor((fragmentLen - sr_len) / 2)
strand <- strand(data)
new_levels <- vector(mode="integer", length=3)
new_levels[which(levels(strand) == "-")] <- -1
new_levels[which(levels(strand) == "+")] <- +1
new_levels[which(levels(strand) == "*")] <- 0
levels(strand) <- new_levels
strand <- as.numeric(levels(strand))[strand]
# Correction for non rounded shifts in negative strand (ie 147/40)
# If the fragment_length - read_length is odd and negative strand,
# add 1 extra base
correct <- rep(0, length(shift))
sel <- (strand == -1) & ((fragmentLen-sr_len) %% 2 == 1)
correct[sel] <- 1
start <- position(data) + ((shift+correct) * strand)
res <- GRanges(
ranges = IRanges(start=start, width=sr_len),
space = chromosome(data)
)
#######################################################################
} else if (type == "paired") {
sr_len <- nchar(sread(data))
selection <- sr_len < fragmentLen
seqnames <- chromosome(data)[selection]
# Normal start or trimmed version if requested
start <- position(data)[selection]
if (!missing(trim)) {
start <- start + (sr_len[selection] / 2 - floor(trim / 2))
}
width <- sr_len[selection]
if (!missing(trim)) {
width <- trim
}
res <- GRanges(
ranges = IRanges(start=start, width=width),
seqnames = seqnames
)
#######################################################################
} else {
stop(paste(
"type must be 'single' for single-ended data or 'paired' ",
"for paired-ended"
))
}
return (res)
}
)
#' @rdname processReads
#' @importFrom GenomicRanges GRangesList
setMethod(
"processReads",
signature(data="CompressedGRangesList"),
function (data, type="single", fragmentLen, trim, ...)
GRangesList(lapply(data,
processReads,
type=type,
fragmentLen=fragmentLen,
trim=trim,
...))
)
#' @rdname processReads
#' @importFrom GenomicRanges GRanges
#' @importMethodsFrom BiocGenerics end
#' @importMethodsFrom GenomeInfoDb seqnames
#' @importMethodsFrom S4Vectors space
setMethod(
"processReads",
signature(data="GRanges"),
function (data, type="single", fragmentLen, trim, ...) {
if (missing(fragmentLen)) {
if (type == "single") {
message(
" * fragmentLen not provided for strand correction, ",
"infering automatically..."
)
fragmentLen <- fragmentLenDetect(data, ...)
message(paste(" * fragmentLen =", fragmentLen))
} else {
fragmentLen <- Inf # Don't remove anything
}
}
#######################################################################
if (type == "single") { # Special case for trim==fragmentLength
if (!missing(trim)) {
if (trim == fragmentLen) {
start <- start(data)
negStrand <- data$strand == "-"
start[negStrand] <- end(data)[negStrand] - fragmentLen
res <- GRanges(
ranges = IRanges(start=start, width=fragmentLen),
seqnames = seqnames(data)
)
return (res)
}
}
# If no trim restriction, use whole read length,
# else use trim length
if (missing(trim)) {
sr_len <- width(data)
} else {
sr_len <- trim
}
# Floor, then we will add 1 if needed
shift <- floor((fragmentLen - sr_len) / 2)
strand <- rep(1, length(data))
strand[data$strand == "-"] <- -1
# Correction for non rounded shifts in negative strand (ie 147/40)
# If the fragment_length - read_length is odd and negative strand,
# add 1 extra base
correct <- rep(0, length(strand))
correct[(strand == -1) & ((fragmentLen - sr_len) %% 2 == 1)] <- 1
start <- start(data) + ((shift+correct) * strand)
res <- GRanges(
ranges = IRanges(start=start, width=sr_len),
seqnames= seqnames(data)
)
#######################################################################
} else if (type == "paired") {
data <- data[width(data) < fragmentLen, ]
sr_len <- width(data)
# Normal start or trimmed version if requested
start <- start(data)
if (!missing(trim)) {
start <- start + ((sr_len / 2) - floor(trim / 2))
}
width <- sr_len
if (!missing(trim)) {
width <- trim
}
res <- GRanges(
ranges = IRanges(start=start, width=width),
seqnames = seqnames(data)
)
#######################################################################
} else {
stop(paste(
"type must be 'single' for single-ended data or",
"'paired' for paired-ended"
))
}
return (res)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.