R/coverage_bwtool.R

Defines functions coverage_bwtool

Documented in coverage_bwtool

#' Given a set of regions and bigwig files, compute the coverage matrix for
#' using \code{bwtool} and build a
#' \link[SummarizedExperiment]{RangedSummarizedExperiment-class} object.
#'
#' Given a set of regions and bigwig files, compute the coverage matrix for
#' using \code{bwtool} and build a
#' \link[SummarizedExperiment]{RangedSummarizedExperiment-class} object.
#'
#' @param bws A named vector with the paths to the bigWig files. The names
#' are used as sample ids.
#' @param regions A \link[GenomicRanges]{GRanges-class} object with regions
#' for which to calculate the coverage matrix.
#' @param strand Either *, + or -. If set to * (default) then all regions are
#' used. Otherwise the matrix is subset to the regions of the corresponding
#' strand. The users should supply the correct corresponding list of bigWig
#' files in \code{bws}.
#' @param pheno \code{NULL} by default. Specify the data.frame with the same
#' length as \code{bws} to be used in the resulting RSE object.
#' @param bwtool The path to \code{bwtool}. Uses as the default the
#' location at JHPCE.
#' @param bpparam A \link[BiocParallel]{BiocParallelParam-class} instance which
#' will be used to calculate the coverage matrix in parallel. By default, 
#' \link[BiocParallel]{SerialParam-class} will be used.
#' @param verbose If \code{TRUE} basic status updates will be printed along the 
#' way.
#' @param sumsdir The path to an existing directory where the \code{bwtool}
#' sum tsv files will be saved. We recommend setting this to a value beyond
#' the default one.
#' @param commands_only If \code{TRUE} the bwtool commands will be saved in a
#' file called coverage_bwtool_strandSTRAND.txt and exit without running
#' \code{bwtool}. This is useful if you have a very large regions set and want
#' to run the commands in an array job. Then run
#' \code{coverage_bwtool(commands_only = FALSE)} to create the RSE
#' object(s).
#' @param overwrite Logical, whether to overwrite output files.
#' @param stranded_sumsdir Logical, whether to automatically add the strand
#' to \code{sumsdir}, to avoid overwriting files from different strands.
#' 
#'
#' @return A \link[SummarizedExperiment]{RangedSummarizedExperiment-class}
#' object with the counts stored in the assays slot. 
#'
#' @details Based on \link{coverage_matrix_bwtool}, but made more general for
#' data outside recount2.
#'
#' @author Leonardo Collado-Torres
#' @export
#'
#' @importFrom methods is
#' @import GenomicRanges BiocParallel rtracklayer SummarizedExperiment S4Vectors
#'
#'
#' @seealso \link[recount.bwtool]{coverage_matrix_bwtool}
#'
#' @examples
#'
#' if(.Platform$OS.type != 'windows') {
#' ## Disable the example for now. I'd have to figure out how to install
#' ## bwtool on travis
#' if(FALSE) {
#'     ## Reading BigWig files is not supported by rtracklayer on Windows
#'     ## (only needed for defining the regions in this example)
#'     
#'     ## TODO
#' }
#' }
#'

coverage_bwtool <- function(bws, regions, strand = '*', pheno = NULL,
    bwtool = '/dcl01/leek/data/bwtool/bwtool-1.0/bwtool',
    bpparam = NULL, verbose = TRUE, sumsdir = tempdir(),
    commands_only = FALSE, overwrite = FALSE, stranded_sumsdir = TRUE) {
        
    ## Check inputs
    stopifnot(!is.null(names(bws)))
    stopifnot(strand %in% c('*', '+', '-'))
    stopifnot(all(file.exists(bws)))
    
    ## Build a pheno if missing
    if(is.null(pheno)) {
        pheno <- data.frame(bigwig_path = bws, bigwig_file = basename(bws),
            sample = names(bws))
    } else {
        stopifnot(nrow(pheno) == length(bws))
    }
    
    if(stranded_sumsdir) {
        sumsdir <- paste0(sumsdir, '_', ifelse(strand == '*', 'unstranded',
            ifelse(strand == '+', 'strand_positive', 'strand_minus')))
        message(paste(Sys.time(), 'switched sumsdir to', sumsdir))
    }
    dir.create(sumsdir, recursive = TRUE, showWarnings = FALSE)
    
    ##  Subset regions by strand if needed
    if(strand != '*') {
        regions <- regions[strand(regions) == strand]
    }
    
    ## Export regions to a BED file if necessary
    bed <- file.path(sumsdir, paste0('coverage_bwtool_strand', strand, '_',
        Sys.Date(), '.bed'))
    
    if(!file.exists(bed) || overwrite) {
        if (verbose) message(paste(Sys.time(), 'creating the BED file', bed))
        tmpreg <- regions
        strand(tmpreg) <- '*'
        rtracklayer::export(tmpreg, con = bed, format = 'BED')
        rm(tmpreg)
        stopifnot(file.exists(bed))
    }
    
    ## Define bpparam
    if(is.null(bpparam)) bpparam <- BiocParallel::SerialParam()
    
    ## Run bwtool and load the data
    counts <- bpmapply(.run_bwtool, bws, names(bws),
        MoreArgs = list('bwtool' = bwtool, 'bed' = bed, 'sumsdir' = sumsdir,
        'verbose' = verbose, 'commands_only' = commands_only,
        'overwrite' = overwrite),
        SIMPLIFY = FALSE, BPPARAM = bpparam)
    if(commands_only) {
        commands <- unlist(counts)
        cat(commands, file = paste0('coverage_bwtool_strand', strand,
            '.txt'), sep = '\n')
        return(invisible(NULL))
    }
    
    ## Group results from all files
    counts <- do.call(cbind, counts)

    ## Build a RSE object
    rse <- SummarizedExperiment(assays = list('counts' = counts),
            colData = DataFrame(pheno), rowRanges = regions)
            
    ## Finish
    return(rse)
}
LieberInstitute/recount.bwtool documentation built on Feb. 7, 2020, 3:53 p.m.