R/mixNBHMMDataSetFromBam.R

Defines functions mixNBHMMDataSetFromBam

Documented in mixNBHMMDataSetFromBam

#' Create a mixNBHMMDataSet from a set of BAM files
#'
#' This function creates a (RangedSummarizedExperiment) mixNBHMMDataSet object out of a set of BAM files.
#' It assumes that the BAM files have been through all the necessary pre-processing steps for ChIP-seq data analysis,
#' such as low quality read filtering and PCR duplicate removal.
#'
#' @param bamFiles a set of paths from sorted and indexed BAM files. The index files must be located in the same directory of their respective bam files.
#' The index files must be named like their respective bam file with the additional “.bai” suffix
#' @param colData a data frame with the information about the bamFiles. It should contain two named columns \code{Condition} and \code{Replicate}
#' referring to the experimental conditions and their replicate identifiers.
#' @param genome either 'hg19' or a GRanges object with the chromosome lengths of the reference genome (see details)
#' @param blackList either 'hg19', NULL, or a GRanges object with the genomic coordinates to be removed (see details). If NULL, no genomic regions will be discarded.
#' @param fragLength either a vector of estimated fragment lengths relative to the bamFiles experiments, or NULL (default). If NULL, it will be estimated using csaw cross-correlation analysis with default parameters.
#' @param windowSize an integer specifying the size of genomic windows where read counts will be computed
#'
#' @details
#'
#' If \code{genome='hg19'}, this function will compute read counts using the reference genome hg19 from the package \code{BSgenome.Hsapiens.UCSC.hg19} on chromosomes 1,...,22,X,Y in common across all experiments.
#' Otherwise, \code{genome} should be a GRanges object with chromosome names (\code{seqnames}), start positions (\code{start}), and end positions (\code{end}).
#'
#' If \code{blackList='hg19'}, this function will exclude genomic coordinates pertaining to blacklisted regions defined in
#' https://github.com/Boyle-Lab/Blacklist/tree/master/lists. See https://sites.google.com/site/anshulkundaje/projects/blacklists for a detailed discussion and recommendations.
#'
#'
#' @return A (RangedSummarizedExperiment) mixNBHMMDataSet
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @references
#' Nucleic Acids Research 44, e45 (2015).
#'
#' @import BSgenome.Hsapiens.UCSC.hg19 bamsignals Rsamtools GenomeInfoDb csaw
#' @rawNamespace import(SummarizedExperiment, except = shift)
#' @rawNamespace import(IRanges, except = shift)
#' @rawNamespace import(GenomicRanges, except = shift)
#'
#' @export
mixNBHMMDataSetFromBam <- function(bamFiles,colData,genome,blackList,windowSize,fragLength = NULL){
    hg19.blacklist.v2 = NULL

    # Checking dimensions
    if(!(nrow(colData)==length(bamFiles)) | !all(c('Condition','Replicate')%in%names(colData))){
        stop('The argument colData must be a data frame with proper dimensions.')
    }

    if(!is.null(fragLength)){
        if(!(length(fragLength)==length(bamFiles)) | !all(is.numeric(fragLength))){
            stop('The argument fragLength must be a numeric vector.')
        }
    }

    # Checking whether windowSize is an integer
    if(is.numeric(windowSize)){
        if(!windowSize%%1==0){
            stop('The argument windowSize must be an integer number')
        }
    } else{
        stop('The argument windowSize must be an integer number')
    }

    # Setting up reference genome
    if(class(genome)=="GRanges"){
        gr.genome <- genome
    } else{
        if(genome=='hg19'){
            chrlist <- Reduce(intersect,lapply(bamFiles,FUN = function(x){unique(as.character(Rsamtools::scanBam(x)[[1]]$rname))}))
            gr.genome <- GenomicRanges::GRanges(chrlist,IRanges::IRanges(1,GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg19::Hsapiens)[chrlist]))
            GenomeInfoDb::seqlevels(gr.genome) <- paste0('chr',c(1:22,'X','Y'))
            gr.genome <- sort(gr.genome)
        } else{
            stop('The argument genome must be either "hg19" or a "GRanges" object')
        }
    }

    # Setting up blacklisted regions
    if(!is.null(blackList)){
        if(class(blackList)=="GRanges"){
            gr.blackList <- blackList
        } else{
            if(blackList=='hg19'){
                gr.blackList <- mixNBHMM::hg19.blacklist.v2
            } else{
                stop('The argument blackList must be either "hg19" or a "GRanges" object')
            }
        }
        # Cleaning up gr.genome
        gr.genome <- suppressWarnings(GenomicRanges::setdiff(gr.genome,gr.blackList))
        param <- csaw::readParam(discard=gr.blackList)
    } else{
        param <- csaw::readParam()
    }

    # Tiling up the genome
    gr.genome = unlist(GenomicRanges::tile(gr.genome,width=windowSize))

    # Estimating the fragment length and computing read counts
    readlist <- lapply(1:length(bamFiles),FUN = function(i){
        if(is.null(fragLength)){
            frag <- csaw::maximizeCcf(csaw::correlateReads(bam.files = bamFiles[i],param=param))
        } else{
            frag <- fragLength[i]
        }
        counts <- bamsignals::bamCount(bampath = bamFiles[i],gr = gr.genome,verbose = F,shift = frag/2)
        return(counts)
    })

    mixNBHMMDataSet <- SummarizedExperiment::SummarizedExperiment(assays = list(counts = matrix(unlist(readlist),byrow = F,nrow = length(gr.genome),ncol = length(bamFiles),dimnames = list(NULL,paste(colData$Condition,colData$Replicate,sep='.')))),
                                                                rowRanges = gr.genome,
                                                                colData = colData)

    return(mixNBHMMDataSet)
}
plbaldoni/mixNBHMM documentation built on Dec. 24, 2019, 1:31 p.m.