#' Create a mixNBHMMDataSet from a set of BAM files
#'
#' This function creates a (RangedSummarizedExperiment) mixNBHMMDataSet object out of a set of BAM files.
#' It assumes that the BAM files have been through all the necessary pre-processing steps for ChIP-seq data analysis,
#' such as low quality read filtering and PCR duplicate removal.
#'
#' @param bamFiles a set of paths from sorted and indexed BAM files. The index files must be located in the same directory of their respective bam files.
#' The index files must be named like their respective bam file with the additional “.bai” suffix
#' @param colData a data frame with the information about the bamFiles. It should contain two named columns \code{Condition} and \code{Replicate}
#' referring to the experimental conditions and their replicate identifiers.
#' @param genome either 'hg19' or a GRanges object with the chromosome lengths of the reference genome (see details)
#' @param blackList either 'hg19', NULL, or a GRanges object with the genomic coordinates to be removed (see details). If NULL, no genomic regions will be discarded.
#' @param fragLength either a vector of estimated fragment lengths relative to the bamFiles experiments, or NULL (default). If NULL, it will be estimated using csaw cross-correlation analysis with default parameters.
#' @param windowSize an integer specifying the size of genomic windows where read counts will be computed
#'
#' @details
#'
#' If \code{genome='hg19'}, this function will compute read counts using the reference genome hg19 from the package \code{BSgenome.Hsapiens.UCSC.hg19} on chromosomes 1,...,22,X,Y in common across all experiments.
#' Otherwise, \code{genome} should be a GRanges object with chromosome names (\code{seqnames}), start positions (\code{start}), and end positions (\code{end}).
#'
#' If \code{blackList='hg19'}, this function will exclude genomic coordinates pertaining to blacklisted regions defined in
#' https://github.com/Boyle-Lab/Blacklist/tree/master/lists. See https://sites.google.com/site/anshulkundaje/projects/blacklists for a detailed discussion and recommendations.
#'
#'
#' @return A (RangedSummarizedExperiment) mixNBHMMDataSet
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @references
#' Nucleic Acids Research 44, e45 (2015).
#'
#' @import BSgenome.Hsapiens.UCSC.hg19 bamsignals Rsamtools GenomeInfoDb csaw
#' @rawNamespace import(SummarizedExperiment, except = shift)
#' @rawNamespace import(IRanges, except = shift)
#' @rawNamespace import(GenomicRanges, except = shift)
#'
#' @export
mixNBHMMDataSetFromBam <- function(bamFiles,colData,genome,blackList,windowSize,fragLength = NULL){
hg19.blacklist.v2 = NULL
# Checking dimensions
if(!(nrow(colData)==length(bamFiles)) | !all(c('Condition','Replicate')%in%names(colData))){
stop('The argument colData must be a data frame with proper dimensions.')
}
if(!is.null(fragLength)){
if(!(length(fragLength)==length(bamFiles)) | !all(is.numeric(fragLength))){
stop('The argument fragLength must be a numeric vector.')
}
}
# Checking whether windowSize is an integer
if(is.numeric(windowSize)){
if(!windowSize%%1==0){
stop('The argument windowSize must be an integer number')
}
} else{
stop('The argument windowSize must be an integer number')
}
# Setting up reference genome
if(class(genome)=="GRanges"){
gr.genome <- genome
} else{
if(genome=='hg19'){
chrlist <- Reduce(intersect,lapply(bamFiles,FUN = function(x){unique(as.character(Rsamtools::scanBam(x)[[1]]$rname))}))
gr.genome <- GenomicRanges::GRanges(chrlist,IRanges::IRanges(1,GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg19::Hsapiens)[chrlist]))
GenomeInfoDb::seqlevels(gr.genome) <- paste0('chr',c(1:22,'X','Y'))
gr.genome <- sort(gr.genome)
} else{
stop('The argument genome must be either "hg19" or a "GRanges" object')
}
}
# Setting up blacklisted regions
if(!is.null(blackList)){
if(class(blackList)=="GRanges"){
gr.blackList <- blackList
} else{
if(blackList=='hg19'){
gr.blackList <- mixNBHMM::hg19.blacklist.v2
} else{
stop('The argument blackList must be either "hg19" or a "GRanges" object')
}
}
# Cleaning up gr.genome
gr.genome <- suppressWarnings(GenomicRanges::setdiff(gr.genome,gr.blackList))
param <- csaw::readParam(discard=gr.blackList)
} else{
param <- csaw::readParam()
}
# Tiling up the genome
gr.genome = unlist(GenomicRanges::tile(gr.genome,width=windowSize))
# Estimating the fragment length and computing read counts
readlist <- lapply(1:length(bamFiles),FUN = function(i){
if(is.null(fragLength)){
frag <- csaw::maximizeCcf(csaw::correlateReads(bam.files = bamFiles[i],param=param))
} else{
frag <- fragLength[i]
}
counts <- bamsignals::bamCount(bampath = bamFiles[i],gr = gr.genome,verbose = F,shift = frag/2)
return(counts)
})
mixNBHMMDataSet <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = matrix(unlist(readlist),byrow = F,nrow = length(gr.genome),ncol = length(bamFiles),dimnames = list(NULL,paste(colData$Condition,colData$Replicate,sep='.')))),
rowRanges = gr.genome,
colData = colData)
return(mixNBHMMDataSet)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.