R/process.total.coverage.statistics.R
In varitas: Variant Calling in Targeted Analysis Sequencing Data

Documented in process.total.coverage.statistics

#' Process total coverage statistics
#'
#' @description
#' Process reports generated by flagstat.
#' Assumes reports for before and after off-target filtering have been written to the same file, with separating headers
#'
#' @inheritParams get.coverage.by.sample.statistics
#'
#' @return data frame with extracted statistics
process.total.coverage.statistics <- function(project.directory) {

    flagstat.report.paths <- system.ls(pattern = "*/*stats", directory = project.directory, error = TRUE);
    sample.ids <- extract.sample.ids(flagstat.report.paths, from.filename = TRUE);

    total.coverage.statistics <- list();

    for(i in 1:length(flagstat.report.paths)) {

        ### READ DATA

        filepath <- flagstat.report.paths[i];
        sample.id <- sample.ids[i];

        input.lines <- scan(
            filepath,
            what = character(),
            sep = "\n",
            quiet = TRUE
        );

        ### INPUT TESTS
        if(28 != length(input.lines)) {
            error.message <- paste("Read", length(input.lines), "from Flagstat report. Expected 28 lines.");
            stop(error.message);
        }

        if("Flagstat before filtering off-target reads" != input.lines[1]) {
            stop('Expected line 1 to be "Flagstat before filtering off-target reads"');
        }

        if("Flagstat after filtering off-target reads" != input.lines[15]) {
            stop('Expected line 15 to be "Flagstat after filtering off-target reads"');
        }

        ### EXTRACT NUMBERS
        #
        # this is based on order of flagstat report => not very robust
        # have tried to add lots of input tests to compensate

        ontarget.reads <- stringr::str_extract(input.lines[16], "\\d+") %>% as.numeric;
        total.reads <- stringr::str_extract(input.lines[2], "\\d+") %>% as.numeric;
        mapped.reads <- stringr::str_extract(input.lines[6], "\\d+") %>% as.numeric;
        paired.reads <- stringr::str_extract(input.lines[7], "\\d+") %>% as.numeric;

        sample.coverage.statistics <- data.frame(
            "sample.id" = sample.id,
            "total.reads" = total.reads,
            "mapped.reads" = mapped.reads,
            "ontarget.reads" = ontarget.reads,
            "ontarget.percent" = ontarget.reads/total.reads,
            "mapped.percent" = mapped.reads/total.reads,
            "paired.reads" = paired.reads,
            "paired.percent" = paired.reads/total.reads
        );

        total.coverage.statistics[[ sample.id ]] <- sample.coverage.statistics;

    }

    total.coverage.statistics <- do.call(rbind, total.coverage.statistics);

    ### SANITY CHECKS

    if( any(total.coverage.statistics$mapped.reads > total.coverage.statistics$total.reads) ) {
        stop("One or more samples has more mapped reads than total reads. That can't be right.");
    }

    if( any(total.coverage.statistics$paired.reads > total.coverage.statistics$total.reads) ) {
        stop("One or more samples has more paired reads than total reads. That can't be right.");
    }

    if( any(total.coverage.statistics$ontarget.percent > 1) ) {
        stop("One or more samples has on-target percent greater than 100.");
    }

    return(total.coverage.statistics);
}