R/bam_handler.R

#' A class to manage BAM files.
#'
#' This class will allow to load, convert and normalize alignments and regions
#' files/data.
#'
#' @section Constructor:
#' \describe{
#'    \item{}{\code{bh <- Bam_Handler$new(bam_files, cores = SerialParam())}}
#'    \item{bam_files}{A \code{vector} of BAM filenames. The BAM files must be
#'                    indexed. i.e.: if a file is named file.bam, there must
#'                    be a file named file.bam.bai or file.bai in the same 
#'                    directory.}
#'    \item{cores}{The number of cores available to parallelize the analysis.
#'                Either a positive integer or a \code{BiocParallelParam}.
#'                Default: \code{SerialParam()}.}
#'    \item{paired_end}{If \code{TRUE}, metagene will deal with paired-end 
#'                data. If \code{FALSE}, single-end data are expected}
#' }
#'
#' \code{Bam_Handler$new} returns a \code{Bam_Handler} object that contains
#' and manages BAM files. Coverage related information as alignment count can
#' be obtain by using this object.
#'
#' @return
#' \code{Bam_Handler$new} returns a \code{Bam_Handler} object which contains
#' coverage related information for every BAM files.
#'
#' @section Methods:
#' \describe{
#'    \item{}{\code{bh$get_aligned_count(bam_file)}}
#'    \item{bam_file}{The name of the BAM file.}
#' }
#' \describe{
#'    \item{}{\code{bg$get_bam_name(bam_file)}}
#'    \item{bam_file}{The name of the BAM file.}
#' }
#' \describe{
#'    \item{}{\code{bh$get_rpm_coefficient(bam_file)}}
#'    \item{bam_file}{The name of the BAM file.}
#' }
#' \describe{
#'    \item{}{\code{bh$index_bam_files(bam_files)}}
#'    \item{bam_files}{A \code{vector} of BAM filenames.}
#' }
#' \describe{
#'    \item{}{\code{bh$get_bam_files()}}
#' }
#' \describe{
#'    \item{}{\code{bh$get_coverage(bam_file, regions)
#'                                force_seqlevels = FALSE)}}
#'    \item{bam_file}{The name of the BAM file.}
#'    \item{regions}{A not empty \code{GRanges} object.}
#'    \item{force_seqlevels}{If \code{TRUE}, Remove regions that are not found
#'                    in bam file header. Default: \code{FALSE}. TRUE and FALSE
#'                    respectively correspond to pruning.mode = "coarse" 
#'                    and "error" in ?seqinfo.}
#' }
#' \describe{
#'    \item{}{\code{bh$get_normalized_coverage(bam_file, regions)
#'                                force_seqlevels = FALSE)}}
#'    \item{bam_file}{The name of the BAM file.}
#'    \item{regions}{A not empty \code{GRanges} object.}
#'    \item{force_seqlevels}{If \code{TRUE}, Remove regions that are not found
#'                    in bam file header. Default: \code{FALSE}. TRUE and FALSE
#'                    respectively correspond to pruning.mode = "coarse" 
#'                    and "error" in ?seqinfo.}
#' }
#' \describe{
#'    \item{chip_bam_file}{The path to the chip bam file.}
#'    \item{input_bam_file}{The path to the input (control) bam file.}
#' }
#' @examples
#' bam_file <- get_demo_bam_files()[1]
#' bh <- metagene2:::Bam_Handler$new(bam_files = bam_file)
#' bh$get_aligned_count(bam_file)
#'
#' @importFrom R6 R6Class
#' @import GenomicAlignments
#' @import BiocParallel
#' @import Rsamtools 
#' @export
#' @format A BAM manager

Bam_Handler <- R6Class("Bam_Handler",
    public = list(
        parameters = list(),
        initialize = function(bam_files, cores = SerialParam(), 
                              paired_end = FALSE, strand_specific=FALSE,
                              paired_end_strand_mode=2, extend_reads=0,
                              invert_strand=FALSE) {
            # Check prerequisites
            # bam_files must be a vector of BAM filenames
            if (!is.vector(bam_files, "character")) {
                stop("bam_files must be a vector of BAM filenames")
            }

            # All BAM files must exist
            if (!all(vapply(bam_files, file.exists, TRUE))) {
                stop("At least one BAM file does not exist")
            }

            # All BAM files must be indexed
            if (any(is.na(vapply(bam_files, private$get_bam_index_filename, "")))) {
                stop("All BAM files must be indexed")
            }

            # All BAM names must be unique
            if (is.null(names(bam_files))) {
                bam_names <- tools::file_path_sans_ext(basename(bam_files))
            } else {
                bam_names = names(bam_files)
            }
            
            if (length(bam_names) != length(unique(bam_names))) {
                stop("All BAM names must be unique")
            }
            names(bam_files) = bam_names
            
            # Core must be a positive integer or a BiocParallelParam instance
            isBiocParallel = is(cores, "BiocParallelParam")
            isInteger = ((is.numeric(cores) || is.integer(cores)) &&
                            cores > 0 &&as.integer(cores) == cores)
            if (!isBiocParallel && !isInteger) {
                stop(paste0("cores must be a positive numeric or ",
                    "BiocParallelParam instance"))
            }

            # paired_end must be logical
            if (!is.logical(paired_end)) {
                stop("paired_end argument must be logical")
            }    
            
            # Initialize the Bam_Handler object
            private$parallel_job <- Parallel_Job$new(cores)
            self$parameters[["cores"]] <- private$parallel_job$get_core_count()
            self$parameters[["paired_end"]] <- paired_end
            self$parameters[["strand_specific"]] <- strand_specific
            self$parameters[["paired_end_strand_mode"]] <- paired_end_strand_mode
            self$parameters[["extend_reads"]] <- extend_reads
            self$parameters[["invert_strand"]] <- invert_strand
            
            private$bam_files <- data.frame(bam = bam_files,
                                            stringsAsFactors = FALSE)

            private$bam_files[["aligned_count"]] <-
                vapply(private$bam_files[["bam"]], private$get_file_count, 0)
                
            # Check the seqnames
            get_seqnames <- function(bam_file) {
                bam_file <- Rsamtools::BamFile(bam_file)
                GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(bam_file))
            }
            bam_seqnames <- lapply(private$bam_files$bam, get_seqnames)
            all_seqnames <- unlist(bam_seqnames)
            if (!all(table(all_seqnames) == length(bam_seqnames))) {
                msg <- paste0("\n\nSome bam files have discrepancies in their seqnames.\n\n",
                              "This could be caused by chromosome names",
                              " present only in a subset of the bam ",
                              "files (i.e.: chrY in some bam files, but ",
                              "absent in others.\n\n",
                              "This could also be caused by ",
                              "discrepancies in the seqlevels style",
                              " (i.e.: UCSC:chr1 versus NCBI:1)\n\n")
                warning(msg)
            }
        },
        get_bam_name = function(bam_file) {
            bam <- private$bam_files[["bam"]]
            row_names <- rownames(private$bam_files)
            bam_name <- basename(gsub(".bam$", "", bam_file))
            if (bam_file %in% bam) {
                i <- bam == bam_file
                stopifnot(sum(i) == 1)
                row_names[i]
            } else if (basename(bam_file) %in% bam) {
                i <- bam == tools::file_path_sans_ext(bam_file)
                stopifnot(sum(i) == 1)
                row_names[i]
            } else if (bam_name %in% row_names) {
                bam_name
            } else {
                NULL
            }
        },
        get_aligned_count = function(bam_file) {
            # Check prerequisites
            # The bam file must be in the list of bam files used for
            # initialization
            bam_name <- private$check_bam_file(bam_file)
            i <- rownames(private$bam_files) == bam_name
            private$bam_files[["aligned_count"]][i]
        },
        get_rpm_coefficient = function(bam_file) {
            return(self$get_aligned_count(bam_file) / 1000000)
        },
        index_bam_files = function(bam_files) {
            vapply(bam_files, private$index_bam_file, "")
        },
        get_bam_files = function() {
            private$bam_files
        },
        get_coverage = function(bam_file, regions, force_seqlevels = FALSE, simplify=TRUE) {
            private$generic_get_coverage(bam_file, regions, force_seqlevels, simplify=simplify)
        },
        get_normalized_coverage = function(bam_file, regions,
                            force_seqlevels = FALSE, simplify=TRUE) {
            count <- self$get_aligned_count(bam_file)
            private$generic_get_coverage(bam_file, regions, force_seqlevels, count, simplify=simplify)
        }
    ),
    private = list(
        bam_files = data.frame(),
        parallel_job = '',
        check_bam_file = function(bam_file) {
            if (!is.character(bam_file)) {
                stop("bam_file class should be character")
            }
            if (length(bam_file) != 1) {
                stop("bam_file should contain exactly 1 bam filename")
            }
            bam_name <- self$get_bam_name(bam_file)
            if (is.null(bam_name)) {
                stop(paste0("Bam file ", bam_file, " not found."))
            }
            invisible(bam_name)
        },
        check_bam_levels = function(bam_file, regions, force_seqlevels) {
            bam_levels <- GenomeInfoDb::seqlevels(Rsamtools::BamFile(bam_file))
            if (!all(unique(GenomeInfoDb::seqlevels(regions)) %in% bam_levels))
            {
                if (force_seqlevels == FALSE) {
                    stop(SEQ_LEVEL_ERROR)
                } else { #force_seqlevels = TRUE
                    #force_seqlevels is used here but the user interface 
                    #continue to use force_seqlevels an boolean mode
                    GenomeInfoDb::seqlevels(regions, 
                                    pruning.mode = 'coarse') <- bam_levels
                    if (length(regions) == 0) {
                        stop("No seqlevels matching between regions and bam file")
                    }
                }
            }
            regions
        },
        check_bam_length = function(bam_file, regions) {
            bam_infos <- GenomeInfoDb::seqinfo(Rsamtools::BamFile(bam_file))
            grl <- split(regions, as.character(seqnames(regions)))
            gr_max <- vapply(grl, function(x) max(end(x)), numeric(1))
            i <- match(names(gr_max), seqnames(bam_infos))
            if (!all(GenomeInfoDb::seqlengths(bam_infos)[i] >= gr_max)) {
                stop("Some regions are outside max chromosome length")
            }
            regions
        },
        get_bam_index_filename = function(bam_file) {
            # Look for a file where bai is appended (.bam.bai)
            bai_suffix_filename = paste(bam_file, ".bai", sep="")
            if(file.exists(bai_suffix_filename)) {
                return(bai_suffix_filename)
            } else {
                # Look for a file where bai replaces bam (.bai)
                bam_is_bai_filename = gsub("\\.bam$", ".bai", bam_file)
                if(file.exists(bam_is_bai_filename)) {
                    return(bam_is_bai_filename)
                }
            }
            # No index file found, return NA.
            return(NA_character_)
        },
        index_bam_file = function(bam_file) {
            if (is.na(private$get_bam_index_filename(bam_file))) {
                # If there is no index file, we sort and index the current bam
                # file
                # TODO: we need to check if the sorted file was previously
                # produced before doing the costly sort operation
                sorted_bam_file <- paste0(basename(bam_files), ".sorted")
                sortBam(bam_file, sorted_bam_file)
                sorted_bam_file <- paste0(sorted_bam_file, ".bam")
                indexBam(sorted_bam_file)
                bam_file <- sorted_bam_file
            }
            bam_file
        },
        read_bam_files = function(bam_files) {
            if (length(bam_files) > 1) {
                pos <- lapply(bam_files, read.BAM)
                names <- unique(unlist(lapply(pos, names), use.names = FALSE))
                res <- list()
                fetch_pos <- function(name) {
                    res[["-"]] <- lapply(pos, function(x) x[[name]][["-"]])
                    res[["+"]] <- lapply(pos, function(x) x[[name]][["+"]])
                    res[["-"]] <- do.call("c", res[["-"]])
                    res[["+"]] <- do.call("c", res[["+"]])
                    res[["-"]] <- res[["-"]][order(res[["-"]])]
                    res[["+"]] <- res[["+"]][order(res[["+"]])]
                    res
                }
                result <- lapply(names, fetch_pos)
                names(result) <- names
                result
            } else {
                read.BAM(bam_files)
            }
        },
        get_file_count = function(bam_file) {
            sum(Rsamtools::idxstatsBam(bam_file)$mapped)
        },
        prepare_regions = function(regions, bam_file, force_seqlevels) {
            # The regions must be a GRanges object
            if (!is(regions, "GRanges")) {
                stop("Parameter regions must be a GRanges object.")
            }

            # The seqlevels of regions must all be present in bam_file
            regions <- private$check_bam_levels(bam_file, regions,
                            force_seqlevels = force_seqlevels)
            to_remove <- GenomeInfoDb::seqlevels(regions)[!(GenomeInfoDb::seqlevels(regions) %in%
                                            unique(seqnames(regions)))]
            regions <- GenomeInfoDb::dropSeqlevels(regions, to_remove)

            # The regions must not be empty
            if (length(regions) == 0) {
                stop("Parameter regions must not be an empty GRanges object")
            }

            # The seqlevels of regions must all be present in bam_file
            regions <- private$check_bam_levels(bam_file, regions,
                            force_seqlevels = force_seqlevels)

            # The seqlengths of regions must be smaller or eqal to those in
            # bam_file
            regions <- private$check_bam_length(bam_file, regions)

            # The regions must not be overlapping
            reduce(regions)
        },
        read_alignments = function(regions, bam_file, strand=NULL, 
                                   paired_end=FALSE, paired_end_strand_mode=2) {
            # Subset regions according to strand and determine the value
            # passed to scanBamFlag's isMinusStrand.
            if(!is.null(strand)) {
                regions = regions[strand(regions)==strand]
                strand_flag = c('+'=FALSE, '-'=TRUE, '*'=NA)[strand]
                strand_mode = 0
            } else {
                strand_flag = NA
                strand_mode = paired_end_strand_mode
            }
            
            if(length(regions) > 0) {
                # Build a ScanBamParam object using the correct regions
                # and the correct strand.
                scan_flag = Rsamtools:::scanBamFlag(isMinusStrand=strand_flag)
                param <- Rsamtools:::ScanBamParam(which=reduce(regions), flag=scan_flag)
                
                # Read alignments.
                if(!paired_end) {
                    alignment <- GenomicAlignments:::readGAlignments(bam_file, param=param)
                } else {
                    alignment <- GenomicAlignments:::readGAlignmentPairs(bam_file, param=param,
                                                                         strandMode=strand_mode)
                }
            } else {
                # If there are no regions, build an empty alignment object.
                # By default, passing an empty region set to 
                # GenomicAlignments:::readGAlignments returns all alignments.
                alignment <- GenomicAlignments::GAlignments()
                
                # Our empty GAlignments has no seqinfo, which will return an empty 
                # object when calling coverage, rather than a list of all chromosome,
                # each with a 0-valued Rle.
                bam_object = BamFile(bam_file)
                seqinfo(alignment) = seqinfo(bam_object)
            }            
                
            return(alignment)
        },
        extract_coverage_by_regions = function(regions, bam_file, count=NULL, 
                                               paired_end = FALSE,
                                               strand_specific=FALSE,
                                               paired_end_strand_mode=2, extend=0,
                                               invert_strand=FALSE) {
            if(extend > 0) {
                start(regions) = pmax(start(regions)-extend, 1)
                end(regions) = end(regions)+extend
            }
                                               
            if(!strand_specific) {
                alignment = list('+'=NULL, '-'=NULL,
                                 '*'=private$read_alignments(regions, bam_file, paired_end=paired_end,
                                                             paired_end_strand_mode=paired_end_strand_mode))
            } else {
                plus_strand = "+"
                minus_strand = "-"
                if(invert_strand) {
                    regions = invertStrand(regions)
                    plus_strand = "-"
                    minus_strand = "+"
                }
                
                # Read regions on each strand separately.
                alignment = list()
                alignment[[plus_strand]]=private$read_alignments(regions, bam_file, "+",
                                                                 paired_end=paired_end,
                                                                 paired_end_strand_mode=paired_end_strand_mode)
                alignment[[minus_strand]]=private$read_alignments(regions, bam_file, "-",
                                                                  paired_end=paired_end,
                                                                  paired_end_strand_mode=paired_end_strand_mode)
                alignment[['*']]=private$read_alignments(regions, bam_file, '*',
                                                         paired_end=paired_end,
                                                         paired_end_strand_mode=paired_end_strand_mode)
            }
                
            if (!is.null(count)) {
                weight <- 1 / (count / 1000000)
            } else {
                weight <- 1
            }
            
            weighted_coverage <- function(x, extend) {
                if(is.null(x)) {
                    return(x)
                } else { 
                    if(extend==0) {
                        return(GenomicAlignments::coverage(x) * weight)
                    } else {
                        return(GenomicRanges::coverage(GenomicRanges::resize(as(x, "GRanges"), width=extend, fix="start")) * weight)
                    }
                }
            }
            return(lapply(alignment, weighted_coverage, extend=extend))
        },
        generic_get_coverage = function(bam_file, regions, force_seqlevels = FALSE, count=NULL, simplify=TRUE) {
            private$check_bam_file(bam_file)
            regions <- private$prepare_regions(regions, bam_file,
                                                force_seqlevels)
            coverages = private$extract_coverage_by_regions(regions, bam_file, count,
                                paired_end = self$parameters[['paired_end']],
                                strand_specific = self$parameters[['strand_specific']],
                                paired_end_strand_mode = self$parameters[['paired_end_strand_mode']],
                                extend=self$parameters[['extend_reads']],
                                invert_strand=self$parameters[['invert_strand']])
                                
            if(simplify && !self$parameters[["strand_specific"]]) {
                return(coverages[["*"]])
            } else {
                return(coverages)
            }                                
        }
    )
)

SEQ_LEVEL_ERROR = paste0("Some seqlevels of regions are absent in bam_file. ",
                         "This could be caused by 'unknown' chromosomes or by ",
                         "discrepancies in the seqlevels style",
                         " (i.e.: UCSC:chr1 versus NCBI:1)\n\n",
                         "If you are certain your region and bam files ",
                         "match, you can use force_seqlevels=TRUE to ignore this error.")

Try the metagene2 package in your browser

Any scripts or data that you put into this service are public.

metagene2 documentation built on March 19, 2021, 6 p.m.