R/process.coverage.reports.R
In varitas: Variant Calling in Targeted Analysis Sequencing Data

Documented in process.coverage.reports

#' Process coverageBed reports
#'  
#' @description
#' Process the coverage reports generated by bedtools coverage tool. 
#' 
#' @inheritParams get.coverage.by.sample.statistics
#'
#' @return  final.statistics data frame of coverage statistics generated by parsing through coverage reports
process.coverage.reports <- function(project.directory) {
    
    # TO DO:
    # 	- add tests for reports having expected format
    #	- ask Ros what the "cumulative" coverage numbers are supposed to mean
    
    coverage.report.paths <- system.ls(pattern = "*/*all.coverage.report", directory = project.directory, error = TRUE);
    sample.ids <- extract.sample.ids(coverage.report.paths, from.filename = TRUE);
    single.sample <- !(length(sample.ids) > 1)
    
    # store mean and median coverage per patient
    mean.median.by.sample <- list();
    
    # initialize data frame to store coverage data for all samples
    merged.coverage.data <- data.frame();
    
    
    ### PROCESS EACH SAMPLE 
    for(i in seq_along(coverage.report.paths)) {
        
        path <- coverage.report.paths[i];
        sample.id <- sample.ids[i];
        
        # generated from coverageBed, based on all target regions
        # 	1) 
        # 	2) depth
        # 	3) no. of bases at depth
        #	4) size of A
        # 	5) % of A at depth
        coverage.data <- utils::read.delim(
            path,
            header = FALSE,
            stringsAsFactors = FALSE	
        );
        
        depth.values <- coverage.data[, 2];
        depth.frequencies <- coverage.data[, 3];
        
        # data frame for merging with all other patients
        patient.coverage <- data.frame(depth.values, depth.frequencies);
        names(patient.coverage) <- c('depth', sample.id);
        
        # get the median coverage for the sample
        median.coverage <- tabular.median(
            values = depth.values,
            frequencies = depth.frequencies
            );
        
        # get the mean coverage for each sample
        mean.coverage <- tabular.mean(
            values = depth.values,
            frequencies = depth.frequencies
            );
        
        mean.median.by.sample[[ sample.id ]] <- data.frame(
            "sample.id" = sample.id,
            "mean.coverage" = mean.coverage, 
            "median.coverage" = median.coverage
        );	
        
        # merge with full data frame
        if( 0 == nrow(merged.coverage.data) ) {
            merged.coverage.data <- patient.coverage;				
        } else {
            
            merged.coverage.data <- merge(
                merged.coverage.data, 
                patient.coverage,
                by.x = "depth", 
                by.y = "depth",
                all = TRUE
            );
        }
    }
    
    ### POST-PROCESSING
    
    mean.median.by.sample <- do.call(rbind, mean.median.by.sample);
    
    merged.coverage.data$bin <- cut(
        merged.coverage.data$depth,
        breaks = c(-Inf, 1, 10, 20, 30, 40, Inf),
        labels = c("0", "1-10", "10-20", "20-30", "30-40", "40+")
        # labels = c(0, 1, 10, 20, 30, 40)
    );
    
    # for each patient, get proportion of frequencies falling within each depth category
    if (!single.sample){
      coverage.statistics <- apply(
          merged.coverage.data[, 2:(ncol(merged.coverage.data) - 1)],
          2, 
          FUN = function(x, coverage.bin) {
              tapply(x, coverage.bin, sum, na.rm = TRUE )/ sum(x, na.rm = TRUE);
          },
          coverage.bin = merged.coverage.data$bin
      );
    } else {
      coverage.statistics <- data.frame(sample.ids=tapply(merged.coverage.data[, 2], merged.coverage.data$bin, sum, na.rm = TRUE) / sum(merged.coverage.data[ ,2], na.rm = TRUE))
    }
    
    # for all categories except the first one, get the proportion of frequencies falling into that category or higher
    # 	- first category (coverage zero) is still just the proportion falling in that category
    for( i in 2:(nrow(coverage.statistics)-1) ) {
        if (!single.sample) {
          coverage.statistics[i,] <- apply(coverage.statistics[i:nrow(coverage.statistics),], 2, sum, na.rm = TRUE)
        } else {
          coverage.statistics[i,] <- sum(coverage.statistics[i:nrow(coverage.statistics),], na.rm = TRUE)
        }
    }
    
    # transpose
    #if (!single.sample) {
    names(coverage.statistics) <- sample.ids;
    coverage.statistics <- t(coverage.statistics);
    #} else {
    #  rownames(coverage.statistics) <- sample.ids
    #}
    
    # add mean/ median per sample
    # make sure they're ordered the same way – anything else will lead to disappointment
    if (!single.sample) {
      mean.median.by.sample <- mean.median.by.sample[rownames(coverage.statistics), ];
    }
    
    print(mean.median.by.sample);
    print(coverage.statistics);


    # sanity check
    if( !identical(rownames(coverage.statistics), rownames(mean.median.by.sample) ) ) {
        stop("coverage.statistics and mean.median.by.sample do not appear to be in the same order. Please investigate.");	
    }
    
    # assemble final data frame
    final.statistics <- cbind(
        coverage.statistics,
        mean.median.by.sample
    );
    
    return(final.statistics); 
}