R/binReads.R
In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

Documented in binReads

#' Convert aligned reads from various file formats into read counts in equidistant bins
#'
#' Convert aligned reads in .bam or .bed(.gz) format into read counts in equidistant windows.
#'
#' Convert aligned reads from .bam or .bed(.gz) files into read counts in equidistant windows (bins). This function uses \code{GenomicRanges::countOverlaps} to calculate the read counts, or alternatively \code{bamsignals::bamProfile} if option \code{use.bamsignals} is set (only effective for .bam files).
#'
#' @aliases binning
#' @param file A file with aligned reads. Alternatively a \code{\link{GRanges-class}} with aligned reads.
#' @param experiment.table An \code{\link{experiment.table}} containing the supplied \code{file}. This is necessary to uniquely identify the file in later steps of the workflow. Set to \code{NULL} if you don't have it (not recommended).
#' @param ID Optional ID to select a row from the \code{experiment.table}. Only necessary if the experiment table contains the same file in multiple positions in column 'file'.
#' @inheritParams readBamFileAsGRanges
#' @inheritParams readBedFileAsGRanges
#' @param binsizes An integer vector specifying the bin sizes to use.
#' @param stepsizes An integer vector specifying the step size. One number can be given for each element in \code{binsizes}, \code{reads.per.bin} and \code{bins} (in that order).
#' @param bins A \code{\link{GRanges-class}} or a named \code{list()} with \code{\link{GRanges-class}} containing precalculated bins produced by \code{\link{fixedWidthBins}} or \code{\link{variableWidthBins}}. Names of the list must correspond to the binsize. If the list is unnamed, an attempt is made to automatically determine the binsize.
#' @param reads.per.bin Approximate number of desired reads per bin. The bin size will be selected accordingly.
#' @param variable.width.reference A BAM file that is used as reference to produce variable width bins. See \code{\link{variableWidthBins}} for details.
#' @param chromosomes If only a subset of the chromosomes should be binned, specify them here.
#' @param use.bamsignals If \code{TRUE} the \pkg{\link[bamsignals]{bamsignals}} package is used for parsing of BAM files. This gives tremendous speed advantage for only one binsize but linearly increases for multiple binsizes, while \code{use.bamsignals=FALSE} has a binsize dependent runtime and might be faster if many binsizes are calculated.
#' @param format One of \code{c('bed','bam','GRanges',NULL)}. With \code{NULL} the format is determined automatically from the file ending.
#' @return If only one bin size was specified for option \code{binsizes}, the function returns a single \code{\link{GRanges-class}} object with meta data column 'counts' that contains the read count. If multiple \code{binsizes} were specified , the function returns a named \code{list()} of \link{GRanges-class} objects.
#' @importFrom Rsamtools BamFile indexBam
#' @importFrom bamsignals bamCount
#' @export
#'
#'@examples
#'## Get an example BAM file with ChIP-seq reads
#'file <- system.file("extdata", "euratrans",
#'                    "lv-H3K27me3-BN-male-bio2-tech1.bam",
#'                     package="chromstaRData")
#'## Bin the file into bin size 1000bp
#'data(rn4_chrominfo)
#'data(experiment_table)
#'binned <- binReads(file, experiment.table=experiment_table,
#'                   assembly=rn4_chrominfo, binsizes=1000,
#'                   stepsizes=500, chromosomes='chr12')
#'print(binned)
#'
binReads <- function(file, experiment.table=NULL, ID=NULL, assembly, bamindex=file, chromosomes=NULL, pairedEndReads=FALSE, min.mapq=10, remove.duplicate.reads=TRUE, max.fragment.width=1000, blacklist=NULL, binsizes=1000, stepsizes=binsizes/2, reads.per.bin=NULL, bins=NULL, variable.width.reference=NULL, use.bamsignals=TRUE, format=NULL) {

    ## Determine format
    if (is.null(format)) {
        if (is.character(file)) {
            file.clean <- sub('\\.gz$','', file)
            format <- rev(strsplit(file.clean, '\\.')[[1]])[1]
        } else if (is(file,'GRanges')) {
            format <- 'GRanges'
        } else {
            stop("Could not determine format automatically. Please specify it via the 'format' parameter.")
        }
    }
    if (! format %in% c('bed','bam','GRanges')) {
        stop("Unknown format. Needs to be one of 'bed', 'bam' or 'GRanges'.")
    }

    ## Sanity checks
    if (format=='bed') {
        temp <- assembly # trigger error if not defined
    }
    if (is(bins,'GRanges')) {
        bins <- list(bins)
        names(bins) <- width(bins[[1]])[1]
    }
    if (is(bins,'list')) {
        if (is.null(names(bins))) {
            names(bins) <- sapply(bins, function(x) { width(x)[1] })
        }
    }

    ## Variables for bamsignals
    paired.end <- 'ignore'
    if (pairedEndReads) {
        paired.end <- 'filter'
    }
    
    ## Create INFO object as row from the experiment.table
    if (!is.null(experiment.table)) {
        check.experiment.table(experiment.table)
        info <- experiment.table[grepl(basename(file),basename(as.character(experiment.table$file))),]
        if (nrow(info) > 1) {
            if (is.null(ID)) {
                stop("Please specify argument 'ID' if multiple files in 'experiment.table' are identical.")
            }
            info <- info[ID, ]
        }
        ID <- paste0(info$mark, '-', info$condition, '-rep', info$rep)
        info$ID <- ID
        if (pairedEndReads != info$pairedEndReads) {
            warning("Option 'pairedEndReads' overwritten by entry in 'experiment.table'.")
        }
    } else {
        info <- NULL
        if (format=='GRanges') {
            ID <- 'GRanges'
        } else {
            ID <- basename(file)
        }
    }

    ### Read in the data
    data <- NULL
    if (format == "bed") {
        ## BED (0-based)
        if (!remove.duplicate.reads) {
            data <- readBedFileAsGRanges(file, assembly=assembly, chromosomes=chromosomes, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width, blacklist=blacklist)
        } else {
            data <- readBedFileAsGRanges(file, assembly=assembly, chromosomes=chromosomes, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width, blacklist=blacklist)
        }
        chrom.lengths <- seqlengths(data)
    } else if (format == "bam") {
        ## BAM (1-based)
        if (use.bamsignals) {
            ## Check if bamindex exists
            bamindex.raw <- sub('\\.bai$', '', bamindex)
            bamindex <- paste0(bamindex.raw,'.bai')
            if (!file.exists(bamindex)) {
                ptm <- startTimedMessage("Making bam-index file ...")
                bamindex.own <- Rsamtools::indexBam(file)
                # warning("Couldn't find BAM index-file ",bamindex,". Creating our own file ",bamindex.own," instead.")
                bamindex <- bamindex.own
                stopTimedMessage(ptm)
            }
            ptm <- startTimedMessage(paste0("Reading header from ", file, " ..."))
            chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(file))
            stopTimedMessage(ptm)
        } else {
            if (!remove.duplicate.reads) {
                data <- readBamFileAsGRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=FALSE, min.mapq=min.mapq, max.fragment.width=max.fragment.width, blacklist=blacklist)
            } else {
                data <- readBamFileAsGRanges(file, bamindex, chromosomes=chromosomes, pairedEndReads=pairedEndReads, remove.duplicate.reads=TRUE, min.mapq=min.mapq, max.fragment.width=max.fragment.width, blacklist=blacklist)
            }
            chrom.lengths <- seqlengths(data)
        }
    } else if (format == "GRanges") {
        ## GRanges (1-based)
        data <- file
        chrom.lengths <- seqlengths(data)
    } else {
        stop("Unknown file format.")
    }

    ## Select chromosomes to bin
    if (is.null(chromosomes)) {
        chromosomes <- names(chrom.lengths)
    }
    chroms2use <- intersect(chromosomes, names(chrom.lengths))
    ## Stop if none of the specified chromosomes exist
    if (length(chroms2use)==0) {
      chrstring <- paste0(chromosomes, collapse=', ')
      stop('Could not find length information for any of the specified chromosomes: ', chrstring)
    }
    skipped.chroms <- setdiff(chromosomes, chroms2use)
    if (length(skipped.chroms)>0) {
        warning("Could not find chromosomes ", paste0(skipped.chroms, collapse=', '), ".")
    }

    ## Select only desired chromosomes
    if (!is.null(data)) {
        ptm <- startTimedMessage("Subsetting specified chromosomes ...")
        data <- data[seqnames(data) %in% chroms2use]
        data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
        ## Drop seqlevels where seqlength is NA
        na.seqlevels <- seqlevels(data)[is.na(seqlengths(data))]
        data <- data[seqnames(data) %in% seqlevels(data)[!is.na(seqlengths(data))]]
        data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
        if (length(na.seqlevels) > 0) {
            warning("Dropped seqlevels because no length information was available: ", paste0(na.seqlevels, collapse=', '))
        }
        stopTimedMessage(ptm)
    }
  
    ### Make variable width bins ###
    if (!is.null(variable.width.reference)) {
        message("Making variable width bins:")
        if (is.character(variable.width.reference)) {
            variable.width.reference.clean <- sub('\\.gz$','', variable.width.reference)
            vformat <- rev(strsplit(variable.width.reference.clean, '\\.')[[1]])[1]
        } else if (is(variable.width.reference,'GRanges')) {
            vformat <- 'GRanges'
        }
        if (vformat == 'bam') {
            refreads <- readBamFileAsGRanges(variable.width.reference, bamindex=variable.width.reference, chromosomes=chroms2use, pairedEndReads=pairedEndReads, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, max.fragment.width=max.fragment.width, blacklist=blacklist)
        } else if (vformat == 'bed') {
            refreads <- readBedFileAsGRanges(variable.width.reference, assembly=assembly, chromosomes=chroms2use, remove.duplicate.reads=remove.duplicate.reads, min.mapq=min.mapq, max.fragment.width=max.fragment.width, blacklist=blacklist)
        }
        bins.binsize <- variableWidthBins(refreads, binsizes=binsizes, chromosomes=chroms2use)
        message("Finished making variable width bins.")
    }
        
    ### Make fixed width bins ###
    if (is.null(variable.width.reference)) {
        bins.binsize <- fixedWidthBins(chrom.lengths=chrom.lengths, chromosomes=chroms2use, binsizes=binsizes)
    }
  
    ### Make reads.per.bin bins ###
    bins.rpb <- NULL
    if (!is.null(data)) {
        numcountsperbp <- length(data) / sum(as.numeric(seqlengths(data)))
        binsizes.rpb <- round(reads.per.bin / numcountsperbp, -2)
        bins.rpb <- fixedWidthBins(chrom.lengths=chrom.lengths, chromosomes=chroms2use, binsizes=binsizes.rpb)
    } else if (use.bamsignals & !is.null(reads.per.bin)) {
        ptm <- startTimedMessage("Parsing bamfile to determine binsize for reads.per.bin option ...")
        bins.helper <- suppressMessages( fixedWidthBins(chrom.lengths=chrom.lengths, chromosomes=chroms2use, binsizes=1e6)[[1]] )
        counts <- tryCatch({
            counts <- bamsignals::bamCount(file, bins.helper, mapqual=min.mapq, paired.end=paired.end, tlenFilter=c(0, max.fragment.width), verbose=FALSE)
        }, error = function(err) {
            counts <- bamsignals::bamCount(file, bins.helper, mapqual=min.mapq, paired.end=paired.end, paired.end.max.frag.length=max.fragment.width, verbose=FALSE)
        })
        stopTimedMessage(ptm)
        numcountsperbp <- sum(as.numeric(counts)) / sum(as.numeric(chrom.lengths[chroms2use]))
        binsizes.rpb <- round(reads.per.bin / numcountsperbp, -2)
        bins.rpb <- fixedWidthBins(chrom.lengths=chrom.lengths, chromosomes=chroms2use, binsizes=binsizes.rpb)
    }
    
    ### Combine in bins.list ###
    bins.list <- c(bins.binsize, bins.rpb, bins)
    if (length(stepsizes) != length(bins.list)) {
        stop("Need one step size for each binsize. Make sure that length(stepsizes) == length(binsizes) + length(reads.per.bin) + length(bins).")
    }
 
    ### Loop over all binsizes ###
    if (!use.bamsignals | format=='bed' | format=='GRanges') {
        ptm <- startTimedMessage("Splitting into strands ...")
        data.plus <- data[strand(data)=='+']
        data.minus <- data[strand(data)=='-']
        data.star <- data[strand(data)=='*']
        ptm <- stopTimedMessage(ptm)
    }
    for (ibinsize in 1:length(bins.list)) {
        binsize <- as.numeric(names(bins.list)[ibinsize])
        bins <- bins.list[[ibinsize]]
        stepsize <- stepsizes[ibinsize]
        numsteps <- ceiling(binsize / stepsize)
        offsets <- stepsize * ((1:numsteps) - 1)
        acounts <- array(NA, dim = c(length(bins), numsteps), dimnames = list(bin=NULL, offset=offsets))
        for (ioffset in 1:numsteps) {
            offset <- offsets[ioffset]
            bins.offset <- suppressWarnings( shift(bins, shift = offset) )
        
            if (format == 'bam' & use.bamsignals) {
                ptm <- startTimedMessage("Counting overlaps for binsize ", binsize, " with offset ", offset, " ...")
                suppressWarnings( # suppress warning due to offset over seqlength
                bins.offset$counts <- tryCatch({
                    bins.offset$counts <- bamsignals::bamCount(file, bins.offset, mapqual=min.mapq, paired.end=paired.end, tlenFilter=c(0, max.fragment.width), verbose=FALSE)
                }, error = function(err) {
                    bins.offset$counts <<- bamsignals::bamCount(file, bins.offset, mapqual=min.mapq, paired.end=paired.end, paired.end.max.frag.length=max.fragment.width, verbose=FALSE)
                })
                )
                stopTimedMessage(ptm)
                
            } else {
                readsperbin <- round(length(data) / sum(as.numeric(seqlengths(data))) * binsize, 2)
                ptm <- startTimedMessage("Counting overlaps for binsize ",binsize," with offset ", offset, ", with on average ",readsperbin," reads per bin ...")
                scounts <- suppressWarnings( GenomicRanges::countOverlaps(bins.offset, data.star) )
                mcounts <- suppressWarnings( GenomicRanges::countOverlaps(bins.offset, data.minus) )
                pcounts <- suppressWarnings( GenomicRanges::countOverlaps(bins.offset, data.plus) )
                counts <- mcounts + pcounts + scounts
                countmatrix <- matrix(c(counts,mcounts,pcounts), ncol=3)
                colnames(countmatrix) <- c('counts','mcounts','pcounts')
                bins.offset$counts <- countmatrix[, 'counts']
                stopTimedMessage(ptm)
        
            }
            acounts[, as.character(offset)] <- mcols(bins.offset)[,'counts']
        
        }
        
        bins <- bins.list[[ibinsize]]
        bins$counts <- acounts
        bins.list[[ibinsize]] <- bins
        attr(bins.list[[ibinsize]], 'info') <- info
        attr(bins.list[[ibinsize]], 'min.mapq') <- min.mapq
            
    } ### end loop binsizes ###

    if (length(bins.list) == 1) {
        return(bins.list[[1]])
    } else {
        return(bins.list)
    }


}
Any scripts or data that you put into this service are public.
chromstaR documentation built on Nov. 8, 2020, 8:29 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
chromstaR
Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

R/binReads.R
In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

Defines functions binReads

Documented in binReads

Try the chromstaR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

chromstaR Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

R/binReads.R In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

Defines functions binReads

Documented in binReads

Try the chromstaR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

chromstaR
Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data

R/binReads.R
In chromstaR: Combinatorial and Differential Chromatin State Analysis for ChIP-Seq Data