R/makeBins.R

Defines functions variableWidthBins fixedWidthBins

Documented in fixedWidthBins variableWidthBins

#' Make fixed-width bins
#'
#' Make fixed-width bins based on given bin size.
#'
#' @param bamfile A BAM file from which the header is read to determine the chromosome lengths. If a \code{bamfile} is specified, option \code{assembly} is ignored.
#' @param assembly An assembly from which the chromosome lengths are determined. Please see \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}} for available assemblies. This option is ignored if \code{bamfile} is specified. Alternatively a data.frame generated by \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}}.
#' @param chrom.lengths A named character vector with chromosome lengths. Names correspond to chromosomes.
#' @param chromosome.format A character specifying the format of the chromosomes if \code{assembly} is specified. Either 'NCBI' for (1,2,3 ...) or 'UCSC' for (chr1,chr2,chr3 ...). If a \code{bamfile} or \code{chrom.lengths} is supplied, the format will be chosen automatically.
#' @param binsizes A vector of bin sizes in base pairs.
#' @param chromosomes A subset of chromosomes for which the bins are generated.
#' @return A \code{list()} of \code{\link{GRanges-class}} objects with fixed-width bins.
#' @author Aaron Taudt
#' @importFrom Rsamtools BamFile
#' @export
#'
#'@examples
#'## Make fixed-width bins of size 500kb and 1Mb
#'data(rn4_chrominfo)
#'chrom.lengths <- rn4_chrominfo$length
#'names(chrom.lengths) <- rn4_chrominfo$chromosome
#'bins <- fixedWidthBins(chrom.lengths=chrom.lengths, binsizes=c(5e5,1e6))
#'bins
#'
#'## Make bins using NCBI server (requires internet connection)
#'# bins <- fixedWidthBins(assembly='mm10', chromosome.format='NCBI', binsizes=c(5e5,1e6))
#'
fixedWidthBins <- function(bamfile=NULL, assembly=NULL, chrom.lengths=NULL, chromosome.format, binsizes=1e6, chromosomes=NULL) {

    ### Check user input ###
    if (is.null(bamfile) & is.null(assembly) & is.null(chrom.lengths)) {
        stop("Please specify either a 'bamfile', 'assembly' or 'chrom.lengths'")
    }
    if (is.null(bamfile) & is.null(chrom.lengths)) {
        trigger.error <- chromosome.format
    }

    ### Get chromosome lengths ###
    if (!is.null(bamfile)) {
        ptm <- startTimedMessage(paste0("Reading header from ", bamfile, " ..."))
        chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
        stopTimedMessage(ptm)
    } else if (!is.null(assembly)) {
        if (is.character(assembly)) {
            ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...")
            df <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
            stopTimedMessage(ptm)
        } else if (is.data.frame(assembly)) {
            df <- assembly
        } else {
            stop("Unknown assembly")
        }
        chrom.lengths <- df$size
        if (chromosome.format=='UCSC') {
        } else if (chromosome.format=='NCBI') {
            df$chrom = sub('^chr', '', df$chrom)
        }
        names(chrom.lengths) <- df$chrom
        chrom.lengths <- chrom.lengths[!is.na(names(chrom.lengths))]
        chrom.lengths <- chrom.lengths[!is.na(chrom.lengths)]
    } else if (!is.null(chrom.lengths)) {
        chrom.lengths <- chrom.lengths[!is.na(names(chrom.lengths))]
        chrom.lengths <- chrom.lengths[!is.na(chrom.lengths)]
    }
    chroms.in.data <- names(chrom.lengths)
    if (is.null(chromosomes)) {
        chromosomes <- chroms.in.data
    }
    chroms2use <- intersect(chromosomes, chroms.in.data)
    ## Stop if none of the specified chromosomes exist
    if (length(chroms2use)==0) {
        chrstring <- paste0(chromosomes, collapse=', ')
        stop('Could not find length information for any of the specified chromosomes: ', chrstring)
    }
    ## Issue warning for non-existent chromosomes
    diff <- setdiff(chromosomes, chroms.in.data)
    if (length(diff)>0) {
        diffs <- paste0(diff, collapse=', ')
        warning('Could not find length information for the following chromosomes: ', diffs)
    }

    ### Making fixed-width bins ###
    bins.list <- list()
    for (binsize in binsizes) {
      
        ptm <- startTimedMessage("Making fixed-width bins for bin size ", binsize, " ...")
        chrom.lengths.floor <- floor(chrom.lengths / binsize) * binsize
        clfloor2use <- chrom.lengths.floor[chroms2use]
        clfloor2use <- clfloor2use[clfloor2use >= binsize]
        if (length(clfloor2use) == 0) {
            stop("All selected chromosomes are smaller than binsize ", binsize)
        }
        bins <- unlist(GenomicRanges::tileGenome(clfloor2use, tilewidth=binsize), use.names=FALSE)
        if (any(width(bins)!=binsize)) {
            stop("tileGenome failed")
        }
        seqlevels(bins) <- chroms2use
        seqlengths(bins) <- chrom.lengths[seqlevels(bins)]
        skipped.chroms <- setdiff(chromosomes, as.character(unique(seqnames(bins))))
        bins <- dropSeqlevels(bins, skipped.chroms, pruning.mode = 'coarse')
        bins.list[[as.character(binsize)]] <- bins

        if (length(skipped.chroms)>0) {
            warning("The following chromosomes were dropped because they are smaller than binsize ", binsize, ": ", paste0(skipped.chroms, collapse=', '))
        }
        stopTimedMessage(ptm)
        
    }
    
    return(bins.list)

}


#' Make variable-width bins
#' 
#' Make variable-width bins based on a reference BAM file. This can be a simulated file (produced by TODO: insert link and aligned with your favourite aligner) or a real reference.
#' 
#' Variable-width bins are produced by first binning the reference BAM file with fixed-width bins and selecting the desired number of reads per bin as the (non-zero) maximum of the histogram. A new set of bins is then generated such that every bin contains the desired number of reads.
#' 
#' @param reads A \code{\link{GRanges-class}} with reads. See \code{\link{readBamFileAsGRanges}} and \code{\link{readBedFileAsGRanges}}.
#' @param binsizes A vector with binsizes. Resulting bins will be close to the specified binsizes.
#' @param chromosomes A subset of chromosomes for which the bins are generated.
#' @return A \code{list()} of \code{\link{GRanges-class}} objects with variable-width bins.
#' @author Aaron Taudt
#' @importFrom S4Vectors endoapply
#' @export
#'
#'@examples
#'## Get an example BAM file with ChIP-seq reads
#'bamfile <- system.file("extdata", "euratrans", "lv-H3K4me3-BN-female-bio1-tech1.bam",
#'                       package="chromstaRData")
#'## Read the file into a GRanges object
#'reads <- readBamFileAsGRanges(bamfile, chromosomes='chr12', pairedEndReads=FALSE,
#'                     min.mapq=10, remove.duplicate.reads=TRUE)
#'## Make variable-width bins of size 1000bp
#'bins <- variableWidthBins(reads, binsizes=1000)
#'## Plot the distribution of binsizes
#'hist(width(bins[['1000']]), breaks=50)
#'
variableWidthBins <- function(reads, binsizes, chromosomes=NULL) {
    
    ### Check user input ###
    chroms.in.data <- unique(seqnames(reads))
    if (is.null(chromosomes)) {
        chromosomes <- chroms.in.data
    }
    chroms2use <- intersect(chromosomes, chroms.in.data)
    ## Stop if non of the specified chromosomes exist
    if (length(chroms2use)==0) {
        chrstring <- paste0(chromosomes, collapse=', ')
        stop('Could not find length information for any of the specified chromosomes: ', chrstring)
    }
    ## Issue warning for non-existent chromosomes
    diff <- setdiff(chromosomes, chroms.in.data)
    if (length(diff)>0) {
        diffs <- paste0(diff, collapse=', ')
        warning('Could not find length information for the following chromosomes: ', diffs)
    }

    ## Drop unwanted seqlevels
    reads <- reads[seqnames(reads) %in% chroms2use]
    reads <- keepSeqlevels(reads, chroms2use)

    ## Make fixed width bins
    ptm <- startTimedMessage("Binning reads in fixed-width windows ...")
    binned.list <- suppressMessages( binReads(reads, assembly=NULL, binsizes=binsizes, chromosomes=chromosomes) )
    stopTimedMessage(ptm)
    
    ## Sort the reads
    strand(reads) <- '*'
    reads <- sort(reads)
    
    ## Loop over binsizes
    bins.list <- list()
    for (i1 in 1:length(binsizes)) {
        binsize <- binsizes[i1]
        ptm <- startTimedMessage("Making variable-width windows for bin size ", binsize, " ...")
        if (is(binned.list,'GRanges')) {
            binned <- binned.list
        } else {
            binned <- binned.list[[i1]]
        }
        ## Get mode of histogram
        tab <- table(binned$counts)
        modecount <- as.integer(names(which.max(tab[names(tab)!=0])))
        ## Pick only every modecount read
        subreads <- GRangesList()
        skipped.chroms <- character()
        for (chrom in chroms2use) {
            reads.chr <- reads[seqnames(reads)==chrom]
            if (length(reads.chr) >= modecount) {
                idx <- seq(modecount, length(reads.chr), by=modecount)
                subreads[[chrom]] <- reads.chr[idx]
            } else {
                skipped.chroms[chrom] <- chrom
            }
        }
        if (length(skipped.chroms)>0) {
            warning("The following chromosomes were dropped because they are smaller than binsize ", binsize, ": ", paste0(skipped.chroms, collapse=', '))
        }
        subreads <- unlist(subreads, use.names=FALSE)
        ## Adjust length of reads to get consecutive bins
        subreads <- resize(subreads, width=1)
        ## Make new bins
        bins <- gaps(subreads, start=1L, end=seqlengths(subreads)-1L) # gaps until seqlengths-1 because we have to add 1 later to get consecutive bins
        bins <- bins[strand(bins)=='*']
        end(bins) <- end(bins) + 1
        ## We don't want incomplete bins at the end
        bins.split <- split(bins, seqnames(bins))
        bins.split <- S4Vectors::endoapply(bins.split, function(x) { x[-length(x)] })
        bins <- unlist(bins.split, use.names=FALSE)
        ## Remove skipped chromosomes
        bins <- bins[!seqnames(bins) %in% skipped.chroms]
        bins <- dropSeqlevels(bins, skipped.chroms, pruning.mode = 'coarse')

        bins.list[[as.character(binsize)]] <- bins
        stopTimedMessage(ptm)
    }
    
    return(bins.list)

}
ataudt/chromstaR documentation built on Dec. 26, 2021, 12:07 a.m.