R/mergeResults.R

Defines functions calculateFWER mergeResults

Documented in mergeResults

#' Merge results from different chromosomes
#'
#' This function merges the results from running \link{analyzeChr} on several 
#' chromosomes and assigns genomic states using \link{annotateRegions}. It 
#' re-calculates the p-values and q-values using the pooled areas from the null 
#' regions from all chromosomes. Once the results have been merged, 
#' \code{derfinderReport::generateReport} can be used to generate an HTML 
#' report of the results. The \code{derfinderReport} package is available at 
#' https://github.com/lcolladotor/derfinderReport.
#' 
#' @param chrs The chromosomes of the files to be merged.
#' @param prefix The main data directory path, which can be useful if 
#' \link{analyzeChr} is used for several parameters and the results are saved 
#' in different directories.
#' @param significantCut A vector of length two specifiying the cutoffs used to 
#' determine significance. The first element is used to determine significance 
#' for the P-values and FWER adjusted P-values, while the second element is 
#' used for the Q-values (FDR adjusted P-values) similar to
#' \link{calculatePvalues}.
#' @param minoverlap Determines the mininum overlap needed when annotating 
#' regions with \link{annotateRegions}.
#' @param mergePrep If \code{TRUE} the output from \link{preprocessCoverage} is 
#' merged. 
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{ If \code{TRUE} basic status updates will be printed along 
#' the way.}
#' \item{optionsStats }{ The options used in \link{analyzeChr}. By default 
#' \code{NULL} and will be inferred from the output files.}
#' \item{cutoffFstatUsed }{ The actual F-statistic cutoff used. This can be 
#' obtained from the logs or from the output of \link{analyzeChr}. If 
#' \code{NULL} then this function will attempt to re-calculate it.}
#' }
#' Passed to \link{annotateRegions} and \link{extendedMapSeqlevels}.
#' @inheritParams annotateRegions
#'
#'
#' @return Seven Rdata files.
#' \describe{
#' \item{fullFstats.Rdata }{ Full F-statistics from all chromosomes in a list 
#' of Rle objects.}
#' \item{fullTime.Rdata }{ Timing information from all chromosomes.}
#' \item{fullNullSummary.Rdata}{ A DataFrame with the null region information: 
#' statistic, width, chromosome and permutation identifier. It's ordered by the 
#' statistics}
#' \item{fullRegions.Rdata}{ GRanges object with regions found and with full 
#' annotation from \link[bumphunter]{matchGenes}. Note that the column 
#' \code{strand} from \link[bumphunter]{matchGenes} is renamed to 
#' \code{annoStrand} to comply with GRanges specifications. }
#' \item{fullCoveragePrep.Rdata}{ A list with the pre-processed coverage data 
#' from all chromosomes.}
#' \item{fullAnnotatedRegions.Rdata}{ A list as constructed in 
#' \link{annotateRegions} with the assigned genomic states.}
#' \item{optionsMerge.Rdata}{ A list with the options used when merging the 
#' results. Used in \code{derfinderReport::generateReport}.}
#' }
#'
#' @author Leonardo Collado-Torres
#' @seealso \link{analyzeChr}, \link{calculatePvalues}, \link{annotateRegions}
#' @export
#'
#' @importFrom GenomicRanges GRangesList
#' @importMethodsFrom GenomicRanges '$' '$<-' '['
#' @importFrom IRanges RleList
#' @import S4Vectors
#' @importMethodsFrom IRanges cbind values 'values<-' '[' length '$<-'
#' order unlist nrow
#' @importFrom qvalue qvalue
#'
#' @details If you want to calculate the FWER adjusted P-values, supply 
#' \code{optionsStats} which is produced by \link{analyzeChr}.
#'
#' @examples
#' ## The output will be saved in the 'generateReport-example' directory
#' dir.create('generateReport-example', showWarnings = FALSE, recursive = TRUE)
#' 
#' ## For convenience, the derfinder output has been pre-computed
#' file.copy(system.file(file.path('extdata', 'chr21'), package='derfinder', 
#' mustWork=TRUE), 'generateReport-example', recursive=TRUE)
#'
#' ## Merge the results from the different chromosomes. In this case, there's 
#' ## only one: chr21
#' mergeResults(chrs='21', prefix='generateReport-example', 
#' genomicState=genomicState$fullGenome)
#'
#' \dontrun{
#' ## You can then explore the wallclock time spent on each step
#' load(file.path('generateReport-example', 'fullRegions.Rdata'))
#' 
#' ## Process the time info
#' time <- lapply(fullTime, function(x) data.frame(diff(x)))
#' time <- do.call(rbind, time)
#' colnames(time) <- 'sec'
#' time$sec <- as.integer(round(time$sec))
#' time$min <- time$sec / 60
#' time$chr <- paste0('chr', gsub('\\..*', '', rownames(time)))
#' time$step <- gsub('.*\\.', '', rownames(time))
#' rownames(time) <- seq_len(nrow(time))
#' 
#' ## Make plot
#' library('ggplot2')
#' ggplot(time, aes(x=step, y=min, colour=chr)) + geom_point() + 
#'     labs(title='Wallclock time by step') + 
#'     scale_colour_discrete(limits=chrs) + 
#'     scale_x_discrete(limits=names(fullTime[[1]])[-1]) + ylab('Time (min)') + 
#'     xlab('Step')
#' }

mergeResults <- function(chrs = c(1:22, 'X', 'Y'), prefix = '.', 
    significantCut = c(0.05, 0.1), genomicState, minoverlap = 20, 
    mergePrep = FALSE, ...) {
    ## For R CMD check
    prep <- fstats <- regions <- annotation <- timeinfo <- NULL
    
    stopifnot(length(significantCut) == 2 & all(significantCut >= 
        0 & significantCut <= 1))

    ## Advanged argumentsa
# @param verbose If \code{TRUE} basic status updates will be printed along the 
# way.
    verbose <- .advanced_argument('verbose', TRUE, ...)

# @param optionsStats The options used in \link{analyzeChr}.
    optionsStats <- .advanced_argument('optionsStats', NULL, ...)
    
# @param cutoffFstatUsed The actual F-statistic cutoff used. This can be 
# obtained from the logs or from the output of \link{analyzeChr}. If 
# \code{NULL} then this function will attempt to re-calculate it.
    cutoffFstatUsed <- .advanced_argument('cutoffFstatUsed', optionsStats$cutoffFstatUsed, ...)



    ## Use UCSC names for homo_sapiens by default
    chrs <- extendedMapSeqlevels(chrs, ...)
    
    ## save merging options used
    optionsMerge <- list(chrs = chrs, significantCut = significantCut, 
        minoverlap = minoverlap, mergeCall = match.call(),
        cutoffFstatUsed = cutoffFstatUsed, ...)
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Saving options used'))
    save(optionsMerge, file = file.path(prefix, 'optionsMerge.Rdata'))
    
    ## Initialize
    fullCoveragePrep <- fullTime <- fullNullPermutation <- fullNullWidths <- 
        fullNullStats <- fullFstats <- fullAnno <- fullRegs <- vector('list', 
        length(chrs))
    names(fullCoveragePrep) <- names(fullTime) <- names(fullNullPermutation) <-
        names(fullNullWidths) <- names(fullNullStats) <- names(fullFstats) <-
        names(fullAnno) <- names(fullRegs) <- chrs
    
    ## Actual processing
    for (chr in chrs) {
        if (verbose) 
            message(paste(Sys.time(), 'Loading chromosome', chr))
        
        ## Process the F-statistics
        load(file.path(prefix, chr, 'fstats.Rdata'))
        fullFstats[[chr]] <- fstats
        
        ## Process the regions, nullstats and nullwidths
        load(file.path(prefix, chr, 'regions.Rdata'))
        fullRegs[[chr]] <- regions$regions
        fullNullStats[[chr]] <- regions$nullStats
        fullNullWidths[[chr]] <- regions$nullWidths
        fullNullPermutation[[chr]] <- regions$nullPermutation
        
        ## Process the annotation results
        load(file.path(prefix, chr, 'annotation.Rdata'))
        fullAnno[[chr]] <- annotation
        
        ## Process the timing information
        load(file.path(prefix, chr, 'timeinfo.Rdata'))
        fullTime[[chr]] <- timeinfo
        
        ## Process the covPrep data
        if (mergePrep) {
            load(file.path(prefix, chr, 'coveragePrep.Rdata'))
            fullCoveragePrep[[chr]] <- prep
        }
    }
    
    ## Merge regions
    fullRegions <- unlist(GRangesList(fullRegs), use.names = FALSE)
    
    ## Process the annotation
    fullAnnotation <- do.call(rbind, fullAnno)
    if (!is.null(fullAnnotation)) {
        colnames(fullAnnotation)[which(colnames(fullAnnotation) == 
            'strand')] <- 'annoStrand'
        rownames(fullAnnotation) <- NULL
        
        ## For some reason, signature 'AsIs' does not work when
        ## assigning the values() <-
        fullAnnotation$name <- as.character(fullAnnotation$name)
        fullAnnotation$annotation <- as.character(fullAnnotation$annotation)
        
        ## Combine regions with annotation
        values(fullRegions) <- cbind(values(fullRegions),
            DataFrame(fullAnnotation, check.names = FALSE))
    }
    
    
    ## Summarize the null regions
    nulls <- unlist(RleList(fullNullStats), use.names = FALSE)
    widths <- unlist(RleList(fullNullWidths), use.names = FALSE)
    permutations <- unlist(RleList(fullNullPermutation), use.names = FALSE)
    howMany <- unlist(lapply(fullNullStats, length))
    
    if (length(nulls) > 0) {
        ## Proceed only if there are null regions to work with
        fullNullSummary <- DataFrame(stat = nulls, width = widths, 
            chr = Rle(names(fullNullStats), howMany),
            permutation = permutations, check.names = FALSE)
        rm(nulls, widths, howMany, permutations)
        
        fullNullSummary$area <- abs(fullNullSummary$stat) * 
            fullNullSummary$width
        fullNullSummary <- fullNullSummary[order(fullNullSummary$area, 
            decreasing = TRUE), ]
    } else {
        fullNullSummary <- DataFrame(NULL)
    }
    
    ## Determine F-stat cutoff if not supplied
    if(is.null(cutoffFstatUsed)) {
        if(!is.null(optionsStats)) {
            stopifnot(all(c('cutoffType', 'cutoffFstat', 'models') %in% names(optionsStats)))
            if (verbose) 
                message(paste(Sys.time(), 
                    'mergeResults: calculating F-stat cutoff'))
            cutoffFstatUsed <- .calcFstatCutoff(
                cutoffType = optionsStats$cutoffType,
                cutoffFstat = optionsStats$cutoffFstat,
                fstats = unlist(fullFstats),
                models = optionsStats$models
            )
        } else {
            if (verbose) 
                message("Neither 'cutoffFstatUsed' nor 'optionsStats' were supplied, so the FWER calculation step will be skipped.")
        }
        
    }
    if(!is.null(cutoffFstatUsed)) {
        if(nrow(fullNullSummary) > 0) {
            if (verbose) 
                message(paste(Sys.time(), 
                    'mergeResults: calculating FWER'))
            stopifnot(!is.null(optionsStats))
            stopifnot('nPermute' %in% names(optionsStats))
            fullRegions$fwer <- .calculateFWER(
                cutoffFstatUsed = cutoffFstatUsed,
                areaNull = fullNullSummary$area,
                permutation = fullNullSummary$permutation, 
                nPermute = optionsStats$nPermute,
                areaReg = fullRegions$area)
            fullRegions$significantFWER <- factor(fullRegions$fwer < significantCut[1],
                levels = c(TRUE, FALSE))
        } else {
            if (verbose)
                warning('No null regions were found, so the FWER calculation step will be skipped.')
        }
    }
    
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Saving fullNullSummary'))
    save(fullNullSummary, file = file.path(prefix, 'fullNullSummary.Rdata'))
    
    ## Re-calculate p-values and q-values
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Re-calculating the p-values'))
    
    ## Sort by decreasing area
    fullRegions <- fullRegions[order(fullRegions$area, decreasing = TRUE)]
    
    if (nrow(fullNullSummary) > 0) {
        ## Actual calculation
        fullRegions$pvalues <- .calcPval(fullRegions$area,
            as.numeric(fullNullSummary$area))
        
        ## Update info        
        fullRegions$significant <- factor(fullRegions$pvalues < 
            significantCut[1], levels = c(TRUE, FALSE))
        
        ## Sometimes qvalue() fails due to incorrect pi0 estimates
        qvalues <- tryCatch(qvalue(fullRegions$pvalues), error = function(e) NULL)
        if (!is.null(qvalues)) {
            qvalues <- qvalues$qvalues
            sigQval <- factor(qvalues < significantCut[2], levels = c(TRUE, 
                FALSE))
        } else {
            message(paste(Sys.time(), "mergeResults: skipping q-value calculation."))
            qvalues <- rep(NA, length(fullRegions$pvalues))
            sigQval <- rep(NA, length(fullRegions$pvalues))
        }
        fullRegions$qvalues <- qvalues
        fullRegions$significantQval <- sigQval
    }   
    
    ## save GRanges version
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Saving fullRegions'))
    save(fullRegions, file = file.path(prefix, 'fullRegions.Rdata'))
    
    ## Assign genomic states
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: assigning genomic states'))
    fullAnnotatedRegions <- annotateRegions(regions = fullRegions, 
        genomicState = genomicState, minoverlap = minoverlap, 
        annotate = TRUE, ...)
    
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Saving fullAnnotatedRegions'))
    save(fullAnnotatedRegions, file = file.path(prefix,
        'fullAnnotatedRegions.Rdata'))
    
    ## Save Fstats, Nullstats, and time info
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Saving fullFstats'))
    save(fullFstats, file = file.path(prefix, 'fullFstats.Rdata'))
    
    if (verbose) 
        message(paste(Sys.time(), 'mergeResults: Saving fullTime'))
    save(fullTime, file = file.path(prefix, 'fullTime.Rdata'))
    
    if (mergePrep) {
        if (verbose) 
            message(paste(Sys.time(), 'mergeResults: Saving fullCoveragePrep'))
        save(fullCoveragePrep, file = file.path(prefix,
            'fullCoveragePrep.Rdata'))
    }
    
    ## Finish
    return(invisible(NULL))
}

## Calculate FWER adjustments as used in the brainDERs project
.calculateFWER <- function(cutoffFstatUsed, areaNull, permutation, nPermute, areaReg) {
    nullList <- split(as.numeric(areaNull), factor(as.numeric(permutation), levels = seq_len(nPermute)))
    nullList[sapply(nullList, length) == 0] <- cutoffFstatUsed
    maxArea <- sapply(nullList, max)
    fwer <- (sapply(areaReg, function(x) sum(maxArea > x)) + 1) / (length(maxArea) + 1)
    return(fwer)
}
Bioconductor-mirror/derfinder documentation built on Aug. 8, 2017, 5:46 a.m.