R/importReads.R

Defines functions readBedFileAsGRanges readBamFileAsGRanges

Documented in readBamFileAsGRanges readBedFileAsGRanges

#' Import BAM file into GRanges
#'
#' Import aligned reads from a BAM file into a \code{\link{GRanges-class}} object.
#'
#' @param bamfile A sorted BAM file.
#' @param bamindex BAM index file. Can be specified without the .bai ending. If the index file does not exist it will be created and a warning is issued.
#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here.
#' @param pairedEndReads Set to \code{TRUE} if you have paired-end reads in your BAM files (not implemented for BED files).
#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=0} to keep all reads.
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments due to mapping errors of paired end reads.
#' @param blacklist A \code{\link{GRanges-class}} or a bed(.gz) file with blacklisted regions. Reads falling into those regions will be discarded.
#' @param what A character vector of fields that are returned. Uses the \code{Rsamtools::scanBamWhat} function. See \code{Rsamtools::ScanBamParam} to see what is available.
#' @return A \code{\link{GRanges-class}} object containing the reads.
#' @importFrom Rsamtools indexBam BamFile ScanBamParam scanBamFlag
#' @importFrom GenomicAlignments readGAlignmentPairs readGAlignments first
#' @importFrom S4Vectors queryHits
#' @export
#'
#'@examples
#'## Get an example BAM file with ChIP-seq reads
#'bamfile <- system.file("extdata", "euratrans", "lv-H3K4me3-BN-female-bio1-tech1.bam",
#'                       package="chromstaRData")
#'## Read the file into a GRanges object
#'reads <- readBamFileAsGRanges(bamfile, chromosomes='chr12', pairedEndReads=FALSE,
#'                     min.mapq=10, remove.duplicate.reads=TRUE)
#'print(reads)
#'
readBamFileAsGRanges <- function(bamfile, bamindex=bamfile, chromosomes=NULL, pairedEndReads=FALSE, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000, blacklist=NULL, what='mapq') {

    ## Input checks
    if (!is.null(blacklist)) {
        if ( !(is.character(blacklist) | is(blacklist,'GRanges')) ) {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
    }

    ## Check if bamindex exists
    bamindex.raw <- sub('\\.bai$', '', bamindex)
    bamindex <- paste0(bamindex.raw,'.bai')
    if (!file.exists(bamindex)) {
        ptm <- startTimedMessage("Making bam-index file ...")
        bamindex.own <- Rsamtools::indexBam(bamfile)
        # warning("Couldn't find BAM index-file ",bamindex,". Creating our own file ",bamindex.own," instead.")
        bamindex <- bamindex.own
        stopTimedMessage(ptm)
    }
    chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
    chroms.in.data <- names(chrom.lengths)
    if (is.null(chromosomes)) {
        chromosomes <- chroms.in.data
    }
    chroms2use <- intersect(chromosomes, chroms.in.data)
    ## Stop if none of the specified chromosomes exist
    if (length(chroms2use)==0) {
        chrstring <- paste0(chromosomes, collapse=', ')
        stop('The specified chromosomes ', chrstring, ' do not exist in the data.')
    }
    ## Issue warning for non-existent chromosomes
    diff <- setdiff(chromosomes, chroms.in.data)
    if (length(diff)>0) {
        diffs <- paste0(diff, collapse=', ')
        warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
    }

    ## Import the file into GRanges
    ptm <- startTimedMessage("Reading file ",basename(bamfile)," ...")
    gr <- GenomicRanges::GRanges(seqnames=chroms2use, ranges=IRanges(start=rep(1, length(chroms2use)), end=chrom.lengths[chroms2use]))
    if (!remove.duplicate.reads) {
        if (pairedEndReads) {
            data.raw <- GenomicAlignments::readGAlignmentPairs(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what))
        } else {
            data.raw <- GenomicAlignments::readGAlignments(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what))
        }
    } else {
        if (pairedEndReads) {
            data.raw <- GenomicAlignments::readGAlignmentPairs(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=Rsamtools::scanBamFlag(isDuplicate=FALSE)))
        } else {
            data.raw <- GenomicAlignments::readGAlignments(bamfile, index=bamindex, param=Rsamtools::ScanBamParam(which=range(gr), what=what, flag=Rsamtools::scanBamFlag(isDuplicate=FALSE)))
        }
    }
    stopTimedMessage(ptm)

    if (length(data.raw) == 0) {
        if (pairedEndReads) {
            stop(paste0("No reads imported. Does your file really contain paired end reads? Try with 'pairedEndReads=FALSE'"))
        }
        stop(paste0('No reads imported! Check your BAM-file ', bamfile))
    }
    
    ## Filter by mapping quality
    if (pairedEndReads) {
        ptm <- startTimedMessage("Converting to GRanges ...")
        data <- methods::as(data.raw, 'GRanges') # treat as one fragment
        stopTimedMessage(ptm)

        ptm <- startTimedMessage("Filtering reads (mapq >= ", min.mapq, ") ...")
        if (min.mapq>0) {
            mapq.first <- mcols(GenomicAlignments::first(data.raw))$mapq
            mapq.last <- mcols(GenomicAlignments::last(data.raw))$mapq
            mapq.mask <- mapq.first >= min.mapq & mapq.last >= min.mapq
            if (any(is.na(mapq.mask))) {
                warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
            }
            data <- data[which(mapq.mask)]
        }
        # Filter out too long fragments
        data <- data[width(data)<=max.fragment.width]
        stopTimedMessage(ptm)
    } else {
        ptm <- startTimedMessage("Converting to GRanges ...")
        data <- methods::as(data.raw, 'GRanges')
        stopTimedMessage(ptm)

        ptm <- startTimedMessage("Filtering reads (mapq >= ", min.mapq, ") ...")
        if (min.mapq>0) {
            if (any(is.na(mcols(data)$mapq))) {
                warning(paste0(bamfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
                mcols(data)$mapq[is.na(mcols(data)$mapq)] <- -1
            }
            data <- data[mcols(data)$mapq >= min.mapq]
        }
        # Filter out too long fragments
        data <- data[width(data)<=max.fragment.width]
        stopTimedMessage(ptm)
    }

    if (length(data)==0) {
        stop("No reads present after filtering. Please lower your 'min.mapq'.")
    }

    ## Exclude reads falling into blacklisted regions
    if (!is.null(blacklist)) {
        ptm <- startTimedMessage("Filtering blacklisted regions ...")
        if (is.character(blacklist)) {
            if (grepl('^chr', seqlevels(data)[1])) {
                chromosome.format <- 'UCSC'
            } else {
                chromosome.format <- 'NCBI'
            }
            black <- readCustomBedFile(blacklist, skip=0, chromosome.format=chromosome.format)
        } else if (is(blacklist,'GRanges')) {
            black <- blacklist
        } else {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
        overlaps <- findOverlaps(data, black)
        idx <- setdiff(1:length(data), S4Vectors::queryHits(overlaps))
        data <- data[idx]
        stopTimedMessage(ptm)
    }
    
    ## Select only desired chromosomes
    ptm <- startTimedMessage("Subsetting chromosomes ...")
    data <- data[seqnames(data) %in% chroms2use]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    ## Drop seqlevels where seqlength is NA
    na.seqlevels <- seqlevels(data)[is.na(seqlengths(data))]
    data <- data[seqnames(data) %in% seqlevels(data)[!is.na(seqlengths(data))]]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    if (length(na.seqlevels) > 0) {
        warning("Dropped seqlevels because no length information was available: ", paste0(na.seqlevels, collapse=', '))
    }
    stopTimedMessage(ptm)

    if (length(data) == 0) {
        stop(paste0('No reads imported!'))
    }

    

    return(data)

}


#' Import BED file into GRanges
#'
#' Import aligned reads from a BED file into a \code{\link{GRanges-class}} object.
#'
#' @param bedfile A file with aligned reads in BED-6 format. The columns have to be c('chromosome','start','end','description','mapq','strand').
#' @param assembly Please see \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}} for available assemblies. Only necessary when importing BED files. BAM files are handled automatically. Alternatively a data.frame with columns 'chromosome' and 'length'.
#' @param chromosomes If only a subset of the chromosomes should be imported, specify them here.
#' @param remove.duplicate.reads A logical indicating whether or not duplicate reads should be removed.
#' @param min.mapq Minimum mapping quality when importing from BAM files. Set \code{min.mapq=0} to keep all reads.
#' @param max.fragment.width Maximum allowed fragment length. This is to filter out erroneously wrong fragments.
#' @param blacklist A \code{\link{GRanges-class}} or a bed(.gz) file with blacklisted regions. Reads falling into those regions will be discarded.
#' @return A \code{\link{GRanges-class}} object containing the reads.
#' @importFrom utils read.table
#' @importFrom S4Vectors queryHits
#' @export
#'
#'@examples
#'## Get an example BED file with single-cell-sequencing reads
#'bedfile <- system.file("extdata", "liver-H3K4me3-BN-male-bio2-tech1.bed.gz",
#'                        package="chromstaRData")
#'## Read the file into a GRanges object
#'data(rn4_chrominfo)
#'reads <- readBedFileAsGRanges(bedfile, assembly=rn4_chrominfo, chromosomes='chr12',
#'                     min.mapq=10, remove.duplicate.reads=TRUE)
#'print(reads)
#'
readBedFileAsGRanges <- function(bedfile, assembly, chromosomes=NULL, remove.duplicate.reads=FALSE, min.mapq=10, max.fragment.width=1000, blacklist=NULL) {

    ## Input checks
    if (!is.null(blacklist)) {
        if ( !(is.character(blacklist) | is(blacklist,'GRanges')) ) {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
    }

    # File with reads, specify classes for faster import (0-based)
    ptm <- startTimedMessage("Reading file ",basename(bedfile)," ...")
    classes <- c('character','numeric','numeric','NULL','integer','character')
    # Determine if data has trackline
    firstline <- utils::read.table(bedfile, nrows = 1)
    if (grepl('^track', firstline[1,1])) {
        skip <- 1
    } else {
        skip <- 0
    }
    data.raw <- utils::read.table(bedfile, colClasses=classes, skip=skip)
    # Convert to GRanges object
    data <- GenomicRanges::GRanges(seqnames=data.raw[,1], ranges=IRanges(start=data.raw[,2]+1, end=data.raw[,3]), strand=data.raw[,5])    # start+1 to go from [0,x) -> [1,x]
    mcols(data)$mapq <- data.raw[,4]
    remove(data.raw)
    stopTimedMessage(ptm)
    ## Read chromosome length information
    if (is.character(assembly)) {
        if (file.exists(assembly)) {
            df <- utils::read.table(assembly, sep='\t', header=TRUE)
        } else {
            ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...")
            df.chroms <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
            stopTimedMessage(ptm)
            df <- df.chroms[,c('chrom','size')]
            if (grepl('^chr',seqlevels(data)[1])) {
            } else {
                df$chrom = sub('^chr', '', df$chrom)
            }
        }
    } else if (is.data.frame(assembly)) {
        df <- assembly
    } else {
        stop("'assembly' must be either a data.frame with columns 'chromosome' and 'length' or a character specifying the assembly.")
    }
    chrom.lengths <- df[,2]
    names(chrom.lengths) <- df[,1]
    seqlengths(data) <- as.numeric(chrom.lengths[names(seqlengths(data))])

    chroms.in.data <- seqlevels(data)
    if (is.null(chromosomes)) {
        chromosomes <- chroms.in.data
    }
    chroms2use <- intersect(chromosomes, chroms.in.data)
    ## Stop if none of the specified chromosomes exist
    if (length(chroms2use)==0) {
        chrstring <- paste0(chromosomes, collapse=', ')
        stop('The specified chromosomes ', chrstring, ' do not exist in the data.')
    }
    ## Issue warning for non-existent chromosomes
    diff <- setdiff(chromosomes, chroms.in.data)
    if (length(diff)>0) {
        diffs <- paste0(diff, collapse=', ')
        warning(paste0('Not using chromosomes ', diffs, ' because they are not in the data.'))
    }

    ## Select only desired chromosomes
    ptm <- startTimedMessage("Subsetting chromosomes ...")
    data <- data[seqnames(data) %in% chroms2use]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    ## Drop seqlevels where seqlength is NA
    na.seqlevels <- seqlevels(data)[is.na(seqlengths(data))]
    data <- data[seqnames(data) %in% seqlevels(data)[!is.na(seqlengths(data))]]
    data <- keepSeqlevels(data, as.character(unique(seqnames(data))))
    if (length(na.seqlevels) > 0) {
        warning("Dropped seqlevels because no length information was available: ", paste0(na.seqlevels, collapse=', '))
    }
    stopTimedMessage(ptm)

    if (length(data) == 0) {
        stop(paste0('No reads imported!'))
    }

    ## Filter by mapping quality
    ptm <- startTimedMessage("Filtering reads (mapq >= ", min.mapq, ") ...")
    if (min.mapq>0) {
        if (any(is.na(mcols(data)$mapq))) {
            warning(paste0(bedfile,": Reads with mapping quality NA (=255 in BAM file) found and removed. Set 'min.mapq=NULL' to keep all reads."))
            mcols(data)$mapq[is.na(mcols(data)$mapq)] <- -1
        }
        data <- data[mcols(data)$mapq >= min.mapq]
    }
    # Filter out too long fragments
    data <- data[width(data)<=max.fragment.width]
    stopTimedMessage(ptm)
    
    if (length(data)==0) {
        stop("No reads present after filtering. Please lower your 'min.mapq'.")
    }

    ## Exclude reads falling into blacklisted regions
    if (!is.null(blacklist)) {
        ptm <- startTimedMessage("Filtering blacklisted regions ...")
        if (is.character(blacklist)) {
            if (grepl('^chr', seqlevels(data)[1])) {
                chromosome.format <- 'UCSC'
            } else {
                chromosome.format <- 'NCBI'
            }
            black <- readCustomBedFile(blacklist, skip=0, chromosome.format=chromosome.format)
        } else if (is(blacklist,'GRanges')) {
            black <- blacklist
        } else {
            stop("'blacklist' has to be either a bed(.gz) file or a GRanges object")
        }
        overlaps <- findOverlaps(data, black)
        idx <- setdiff(1:length(data), S4Vectors::queryHits(overlaps))
        data <- data[idx]
        stopTimedMessage(ptm)
    }

    return(data)

}
ataudt/chromstaR documentation built on Dec. 26, 2021, 12:07 a.m.