Nothing
#' A class to manage BAM files.
#'
#' This class will allow to load, convert and normalize alignments and regions
#' files/data.
#'
#' @section Constructor:
#' \describe{
#' \item{}{\code{bh <- Bam_Handler$new(bam_files, cores = SerialParam())}}
#' \item{bam_files}{A \code{vector} of BAM filenames. The BAM files must be
#' indexed. i.e.: if a file is named file.bam, there must
#' be a file named file.bam.bai or file.bai in the same
#' directory.}
#' \item{cores}{The number of cores available to parallelize the analysis.
#' Either a positive integer or a \code{BiocParallelParam}.
#' Default: \code{SerialParam()}.}
#' \item{paired_end}{If \code{TRUE}, metagene will deal with paired-end
#' data. If \code{FALSE}, single-end data are expected}
#' }
#'
#' \code{Bam_Handler$new} returns a \code{Bam_Handler} object that contains
#' and manages BAM files. Coverage related information as alignment count can
#' be obtain by using this object.
#'
#' @return
#' \code{Bam_Handler$new} returns a \code{Bam_Handler} object which contains
#' coverage related information for every BAM files.
#'
#' @section Methods:
#' \describe{
#' \item{}{\code{bh$get_aligned_count(bam_file)}}
#' \item{bam_file}{The name of the BAM file.}
#' }
#' \describe{
#' \item{}{\code{bg$get_bam_name(bam_file)}}
#' \item{bam_file}{The name of the BAM file.}
#' }
#' \describe{
#' \item{}{\code{bh$get_rpm_coefficient(bam_file)}}
#' \item{bam_file}{The name of the BAM file.}
#' }
#' \describe{
#' \item{}{\code{bh$index_bam_files(bam_files)}}
#' \item{bam_files}{A \code{vector} of BAM filenames.}
#' }
#' \describe{
#' \item{}{\code{bh$get_bam_files()}}
#' }
#' \describe{
#' \item{}{\code{bh$get_coverage(bam_file, regions)
#' force_seqlevels = FALSE)}}
#' \item{bam_file}{The name of the BAM file.}
#' \item{regions}{A not empty \code{GRanges} object.}
#' \item{force_seqlevels}{If \code{TRUE}, Remove regions that are not found
#' in bam file header. Default: \code{FALSE}. TRUE and FALSE
#' respectively correspond to pruning.mode = "coarse"
#' and "error" in ?seqinfo.}
#' }
#' \describe{
#' \item{}{\code{bh$get_normalized_coverage(bam_file, regions)
#' force_seqlevels = FALSE)}}
#' \item{bam_file}{The name of the BAM file.}
#' \item{regions}{A not empty \code{GRanges} object.}
#' \item{force_seqlevels}{If \code{TRUE}, Remove regions that are not found
#' in bam file header. Default: \code{FALSE}. TRUE and FALSE
#' respectively correspond to pruning.mode = "coarse"
#' and "error" in ?seqinfo.}
#' }
#' \describe{
#' \item{chip_bam_file}{The path to the chip bam file.}
#' \item{input_bam_file}{The path to the input (control) bam file.}
#' }
#' @examples
#' bam_file <- get_demo_bam_files()[1]
#' bh <- metagene2:::Bam_Handler$new(bam_files = bam_file)
#' bh$get_aligned_count(bam_file)
#'
#' @importFrom R6 R6Class
#' @import GenomicAlignments
#' @import BiocParallel
#' @import Rsamtools
#' @export
#' @format A BAM manager
Bam_Handler <- R6Class("Bam_Handler",
public = list(
parameters = list(),
initialize = function(bam_files, cores = SerialParam(),
paired_end = FALSE, strand_specific=FALSE,
paired_end_strand_mode=2, extend_reads=0,
invert_strand=FALSE) {
# Check prerequisites
# bam_files must be a vector of BAM filenames
if (!is.vector(bam_files, "character")) {
stop("bam_files must be a vector of BAM filenames")
}
# All BAM files must exist
if (!all(vapply(bam_files, file.exists, TRUE))) {
stop("At least one BAM file does not exist")
}
# All BAM files must be indexed
if (any(is.na(vapply(bam_files, private$get_bam_index_filename, "")))) {
stop("All BAM files must be indexed")
}
# All BAM names must be unique
if (is.null(names(bam_files))) {
bam_names <- tools::file_path_sans_ext(basename(bam_files))
} else {
bam_names = names(bam_files)
}
if (length(bam_names) != length(unique(bam_names))) {
stop("All BAM names must be unique")
}
names(bam_files) = bam_names
# Core must be a positive integer or a BiocParallelParam instance
isBiocParallel = is(cores, "BiocParallelParam")
isInteger = ((is.numeric(cores) || is.integer(cores)) &&
cores > 0 &&as.integer(cores) == cores)
if (!isBiocParallel && !isInteger) {
stop(paste0("cores must be a positive numeric or ",
"BiocParallelParam instance"))
}
# paired_end must be logical
if (!is.logical(paired_end)) {
stop("paired_end argument must be logical")
}
# Initialize the Bam_Handler object
private$parallel_job <- Parallel_Job$new(cores)
self$parameters[["cores"]] <- private$parallel_job$get_core_count()
self$parameters[["paired_end"]] <- paired_end
self$parameters[["strand_specific"]] <- strand_specific
self$parameters[["paired_end_strand_mode"]] <- paired_end_strand_mode
self$parameters[["extend_reads"]] <- extend_reads
self$parameters[["invert_strand"]] <- invert_strand
private$bam_files <- data.frame(bam = bam_files,
stringsAsFactors = FALSE)
private$bam_files[["aligned_count"]] <-
vapply(private$bam_files[["bam"]], private$get_file_count, 0)
# Check the seqnames
get_seqnames <- function(bam_file) {
bam_file <- Rsamtools::BamFile(bam_file)
GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(bam_file))
}
bam_seqnames <- lapply(private$bam_files$bam, get_seqnames)
all_seqnames <- unlist(bam_seqnames)
if (!all(table(all_seqnames) == length(bam_seqnames))) {
msg <- paste0("\n\nSome bam files have discrepancies in their seqnames.\n\n",
"This could be caused by chromosome names",
" present only in a subset of the bam ",
"files (i.e.: chrY in some bam files, but ",
"absent in others.\n\n",
"This could also be caused by ",
"discrepancies in the seqlevels style",
" (i.e.: UCSC:chr1 versus NCBI:1)\n\n")
warning(msg)
}
},
get_bam_name = function(bam_file) {
bam <- private$bam_files[["bam"]]
row_names <- rownames(private$bam_files)
bam_name <- basename(gsub(".bam$", "", bam_file))
if (bam_file %in% bam) {
i <- bam == bam_file
stopifnot(sum(i) == 1)
row_names[i]
} else if (basename(bam_file) %in% bam) {
i <- bam == tools::file_path_sans_ext(bam_file)
stopifnot(sum(i) == 1)
row_names[i]
} else if (bam_name %in% row_names) {
bam_name
} else {
NULL
}
},
get_aligned_count = function(bam_file) {
# Check prerequisites
# The bam file must be in the list of bam files used for
# initialization
bam_name <- private$check_bam_file(bam_file)
i <- rownames(private$bam_files) == bam_name
private$bam_files[["aligned_count"]][i]
},
get_rpm_coefficient = function(bam_file) {
return(self$get_aligned_count(bam_file) / 1000000)
},
index_bam_files = function(bam_files) {
vapply(bam_files, private$index_bam_file, "")
},
get_bam_files = function() {
private$bam_files
},
get_coverage = function(bam_file, regions, force_seqlevels = FALSE, simplify=TRUE) {
private$generic_get_coverage(bam_file, regions, force_seqlevels, simplify=simplify)
},
get_normalized_coverage = function(bam_file, regions,
force_seqlevels = FALSE, simplify=TRUE) {
count <- self$get_aligned_count(bam_file)
private$generic_get_coverage(bam_file, regions, force_seqlevels, count, simplify=simplify)
}
),
private = list(
bam_files = data.frame(),
parallel_job = '',
check_bam_file = function(bam_file) {
if (!is.character(bam_file)) {
stop("bam_file class should be character")
}
if (length(bam_file) != 1) {
stop("bam_file should contain exactly 1 bam filename")
}
bam_name <- self$get_bam_name(bam_file)
if (is.null(bam_name)) {
stop(paste0("Bam file ", bam_file, " not found."))
}
invisible(bam_name)
},
check_bam_levels = function(bam_file, regions, force_seqlevels) {
bam_levels <- GenomeInfoDb::seqlevels(Rsamtools::BamFile(bam_file))
if (!all(unique(GenomeInfoDb::seqlevels(regions)) %in% bam_levels))
{
if (force_seqlevels == FALSE) {
stop(SEQ_LEVEL_ERROR)
} else { #force_seqlevels = TRUE
#force_seqlevels is used here but the user interface
#continue to use force_seqlevels an boolean mode
GenomeInfoDb::seqlevels(regions,
pruning.mode = 'coarse') <- bam_levels
if (length(regions) == 0) {
stop("No seqlevels matching between regions and bam file")
}
}
}
regions
},
check_bam_length = function(bam_file, regions) {
bam_infos <- GenomeInfoDb::seqinfo(Rsamtools::BamFile(bam_file))
grl <- split(regions, as.character(seqnames(regions)))
gr_max <- vapply(grl, function(x) max(end(x)), numeric(1))
i <- match(names(gr_max), seqnames(bam_infos))
if (!all(GenomeInfoDb::seqlengths(bam_infos)[i] >= gr_max)) {
stop("Some regions are outside max chromosome length")
}
regions
},
get_bam_index_filename = function(bam_file) {
# Look for a file where bai is appended (.bam.bai)
bai_suffix_filename = paste(bam_file, ".bai", sep="")
if(file.exists(bai_suffix_filename)) {
return(bai_suffix_filename)
} else {
# Look for a file where bai replaces bam (.bai)
bam_is_bai_filename = gsub("\\.bam$", ".bai", bam_file)
if(file.exists(bam_is_bai_filename)) {
return(bam_is_bai_filename)
}
}
# No index file found, return NA.
return(NA_character_)
},
index_bam_file = function(bam_file) {
if (is.na(private$get_bam_index_filename(bam_file))) {
# If there is no index file, we sort and index the current bam
# file
# TODO: we need to check if the sorted file was previously
# produced before doing the costly sort operation
sorted_bam_file <- paste0(basename(bam_files), ".sorted")
sortBam(bam_file, sorted_bam_file)
sorted_bam_file <- paste0(sorted_bam_file, ".bam")
indexBam(sorted_bam_file)
bam_file <- sorted_bam_file
}
bam_file
},
read_bam_files = function(bam_files) {
if (length(bam_files) > 1) {
pos <- lapply(bam_files, read.BAM)
names <- unique(unlist(lapply(pos, names), use.names = FALSE))
res <- list()
fetch_pos <- function(name) {
res[["-"]] <- lapply(pos, function(x) x[[name]][["-"]])
res[["+"]] <- lapply(pos, function(x) x[[name]][["+"]])
res[["-"]] <- do.call("c", res[["-"]])
res[["+"]] <- do.call("c", res[["+"]])
res[["-"]] <- res[["-"]][order(res[["-"]])]
res[["+"]] <- res[["+"]][order(res[["+"]])]
res
}
result <- lapply(names, fetch_pos)
names(result) <- names
result
} else {
read.BAM(bam_files)
}
},
get_file_count = function(bam_file) {
sum(Rsamtools::idxstatsBam(bam_file)$mapped)
},
prepare_regions = function(regions, bam_file, force_seqlevels) {
# The regions must be a GRanges object
if (!is(regions, "GRanges")) {
stop("Parameter regions must be a GRanges object.")
}
# The seqlevels of regions must all be present in bam_file
regions <- private$check_bam_levels(bam_file, regions,
force_seqlevels = force_seqlevels)
to_remove <- GenomeInfoDb::seqlevels(regions)[!(GenomeInfoDb::seqlevels(regions) %in%
unique(seqnames(regions)))]
regions <- GenomeInfoDb::dropSeqlevels(regions, to_remove)
# The regions must not be empty
if (length(regions) == 0) {
stop("Parameter regions must not be an empty GRanges object")
}
# The seqlevels of regions must all be present in bam_file
regions <- private$check_bam_levels(bam_file, regions,
force_seqlevels = force_seqlevels)
# The seqlengths of regions must be smaller or eqal to those in
# bam_file
regions <- private$check_bam_length(bam_file, regions)
# The regions must not be overlapping
reduce(regions)
},
read_alignments = function(regions, bam_file, strand=NULL,
paired_end=FALSE, paired_end_strand_mode=2) {
# Subset regions according to strand and determine the value
# passed to scanBamFlag's isMinusStrand.
if(!is.null(strand)) {
regions = regions[strand(regions)==strand]
strand_flag = c('+'=FALSE, '-'=TRUE, '*'=NA)[strand]
strand_mode = 0
} else {
strand_flag = NA
strand_mode = paired_end_strand_mode
}
if(length(regions) > 0) {
# Build a ScanBamParam object using the correct regions
# and the correct strand.
scan_flag = Rsamtools:::scanBamFlag(isMinusStrand=strand_flag)
param <- Rsamtools:::ScanBamParam(which=reduce(regions), flag=scan_flag)
# Read alignments.
if(!paired_end) {
alignment <- GenomicAlignments:::readGAlignments(bam_file, param=param)
} else {
alignment <- GenomicAlignments:::readGAlignmentPairs(bam_file, param=param,
strandMode=strand_mode)
}
} else {
# If there are no regions, build an empty alignment object.
# By default, passing an empty region set to
# GenomicAlignments:::readGAlignments returns all alignments.
alignment <- GenomicAlignments::GAlignments()
# Our empty GAlignments has no seqinfo, which will return an empty
# object when calling coverage, rather than a list of all chromosome,
# each with a 0-valued Rle.
bam_object = BamFile(bam_file)
seqinfo(alignment) = seqinfo(bam_object)
}
return(alignment)
},
extract_coverage_by_regions = function(regions, bam_file, count=NULL,
paired_end = FALSE,
strand_specific=FALSE,
paired_end_strand_mode=2, extend=0,
invert_strand=FALSE) {
if(extend > 0) {
start(regions) = pmax(start(regions)-extend, 1)
end(regions) = end(regions)+extend
}
if(!strand_specific) {
alignment = list('+'=NULL, '-'=NULL,
'*'=private$read_alignments(regions, bam_file, paired_end=paired_end,
paired_end_strand_mode=paired_end_strand_mode))
} else {
plus_strand = "+"
minus_strand = "-"
if(invert_strand) {
regions = invertStrand(regions)
plus_strand = "-"
minus_strand = "+"
}
# Read regions on each strand separately.
alignment = list()
alignment[[plus_strand]]=private$read_alignments(regions, bam_file, "+",
paired_end=paired_end,
paired_end_strand_mode=paired_end_strand_mode)
alignment[[minus_strand]]=private$read_alignments(regions, bam_file, "-",
paired_end=paired_end,
paired_end_strand_mode=paired_end_strand_mode)
alignment[['*']]=private$read_alignments(regions, bam_file, '*',
paired_end=paired_end,
paired_end_strand_mode=paired_end_strand_mode)
}
if (!is.null(count)) {
weight <- 1 / (count / 1000000)
} else {
weight <- 1
}
weighted_coverage <- function(x, extend) {
if(is.null(x)) {
return(x)
} else {
if(extend==0) {
return(GenomicAlignments::coverage(x) * weight)
} else {
return(GenomicRanges::coverage(GenomicRanges::resize(as(x, "GRanges"), width=extend, fix="start")) * weight)
}
}
}
return(lapply(alignment, weighted_coverage, extend=extend))
},
generic_get_coverage = function(bam_file, regions, force_seqlevels = FALSE, count=NULL, simplify=TRUE) {
private$check_bam_file(bam_file)
regions <- private$prepare_regions(regions, bam_file,
force_seqlevels)
coverages = private$extract_coverage_by_regions(regions, bam_file, count,
paired_end = self$parameters[['paired_end']],
strand_specific = self$parameters[['strand_specific']],
paired_end_strand_mode = self$parameters[['paired_end_strand_mode']],
extend=self$parameters[['extend_reads']],
invert_strand=self$parameters[['invert_strand']])
if(simplify && !self$parameters[["strand_specific"]]) {
return(coverages[["*"]])
} else {
return(coverages)
}
}
)
)
SEQ_LEVEL_ERROR = paste0("Some seqlevels of regions are absent in bam_file. ",
"This could be caused by 'unknown' chromosomes or by ",
"discrepancies in the seqlevels style",
" (i.e.: UCSC:chr1 versus NCBI:1)\n\n",
"If you are certain your region and bam files ",
"match, you can use force_seqlevels=TRUE to ignore this error.")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.