R/mixHMMDataSetFromBam.R

Defines functions mixHMMDataSetFromBam

Documented in mixHMMDataSetFromBam

#' Create a mixHMMDataSet from a set of BAM files
#'
#' This function creates a (RangedSummarizedExperiment) mixHMMDataSet object out of a set of BAM files.
#' It assumes that the BAM files have been through all the necessary pre-processing steps for ChIP-seq data analysis,
#' such as low quality read filtering and PCR duplicate removal.
#'
#' @param bamFiles a set of paths from sorted and indexed BAM files. The index files must be located in the same directory of their respective bam files.
#' The index files must be named like their respective bam file with the additional “.bai” suffix
#' @param colData a data frame with the information about the bamFiles. It should contain two named columns \code{Condition} and \code{Replicate}
#' referring to the experimental conditions and their replicate identifiers.
#' @param genome either 'hg19' or a GRanges object with the chromosome lengths of the reference genome (see details)
#' @param blackList either 'hg19', NULL, or a GRanges object with the genomic coordinates to be removed (see details). If NULL, no genomic regions will be discarded.
#' @param fragLength either a vector of estimated fragment lengths relative to the bamFiles experiments, or NULL. If NULL, it will be estimated using csaw cross-correlation analysis with default parameters.
#' @param windowSize an integer specifying the size of genomic windows where read counts will be computed
#'
#' @details
#'
#' If \code{genome='hg19'}, this function will compute read counts using the reference genome hg19 from the package \code{BSgenome.Hsapiens.UCSC.hg19} on chromosomes 1,...,22,X,Y in common across all experiments.
#' Otherwise, \code{genome} should be a GRanges object with chromosome names (\code{seqnames}), start positions (\code{start}), and end positions (\code{end}).
#'
#' If \code{blackList='hg19'}, this function will exclude genomic coordinates pertaining to blacklisted regions defined in
#' https://github.com/Boyle-Lab/Blacklist/tree/master/lists. See https://sites.google.com/site/anshulkundaje/projects/blacklists for a detailed discussion and recommendations.
#'
#'
#' @return A (RangedSummarizedExperiment) mixHMMDataSet
#'
#' @author Pedro L. Baldoni, \email{pedrobaldoni@gmail.com}
#'
#' @references
#' Nucleic Acids Research 44, e45 (2015).
#'
#' @import BSgenome.Hsapiens.UCSC.hg19 bamsignals Rsamtools GenomeInfoDb GenomicRanges IRanges csaw SummarizedExperiment S4Vectors
#'
#' @export
mixHMMDataSetFromBam <- function(bamFiles,colData,genome,blackList,fragLength,windowSize){

    # Checking dimensions
    if(!(nrow(colData)==length(bamFiles)) | !(ncol(colData)==2) | !all(names(colData)%in%c('Condition','Replicate'))){
        stop('The argument colData must be a data frame with proper dimensions.')
    }

    if(!is.null(fragLength)){
        if(!(length(fragLength)==length(bamFiles)) | !all(is.numeric(fragLength))){
            stop('The argument fragLength must be a numeric vector.')
        }
    }

    # Checking whether windowSize is an integer
    if(is.numeric(windowSize)){
        if(!windowSize%%1==0){
            stop('The argument windowSize must be an integer number')
        }
    } else{
        stop('The argument windowSize must be an integer number')
    }

    # Setting up reference genome
    if(class(genome)=="GRanges"){
        gr.genome <- genome
    } else{
        if(genome=='hg19'){
            chrlist <- Reduce(intersect,lapply(bamFiles,FUN = function(x){GenomeInfoDb::seqnames(GenomeInfoDb::seqinfo(Rsamtools::BamFile(x)))}))
            gr.genome <- GenomicRanges::GRanges(chrlist,IRanges::IRanges(1,GenomeInfoDb::seqlengths(BSgenome.Hsapiens.UCSC.hg19::Hsapiens)[chrlist]))
            GenomeInfoDb::seqlevels(gr.genome) <- paste0('chr',c(1:22,'X','Y'))
            gr.genome <- sort(gr.genome)
        } else{
            stop('The argument genome must be either "hg19" or a "GRanges" object')
        }
    }

    # Setting up blacklisted regions
    if(!is.null(blackList)){
        if(class(blackList)=="GRanges"){
            gr.blackList <- blackList
        } else{
            if(blackList=='hg19'){
                gr.blackList <- hg19.blacklist.v2
            } else{
                stop('The argument blackList must be either "hg19" or a "GRanges" object')
            }
        }
        # Cleaning up gr.genome
        gr.genome <- suppressWarnings(GenomicRanges::setdiff(gr.genome,gr.blackList))
        param <- csaw::readParam(discard=gr.blackList)
    } else{
        param <- csaw::readParam()
    }

    # Tiling up the genome
    gr.genome = unlist(GenomicRanges::tile(gr.genome,width=windowSize))

    # Estimating the fragment length and computing read counts
    readlist <- lapply(1:length(bamFiles),FUN = function(i){
        if(is.null(fragLength)){
            frag <- csaw::maximizeCcf(csaw::correlateReads(bam.files = bamFiles[i],param=param))
        } else{
            frag <- fragLength[i]
        }
        counts <- bamsignals::bamCount(bampath = bamFiles[i],gr = gr.genome,verbose = F,shift = frag/2)
        return(counts)
    })

    mixHMMDataSet <- SummarizedExperiment::SummarizedExperiment(assays = S4Vectors::SimpleList(counts = matrix(unlist(readlist),byrow = F,nrow = length(gr.genome),ncol = length(bamFiles))),
                                                                rowRanges = gr.genome,
                                                                colData = colData)

    return(mixHMMDataSet)
}
plbaldoni/mixHMM documentation built on Nov. 8, 2019, 8:05 p.m.