R/importNgsLogs.R

Defines functions .parseFlagstatLogs .parseQuastLogs .parseTrimmomaticLogs .parseBuscoLogs .parseFeatureCountsLogs .parseCutadaptLogs .parseAdapterRemovalLogs .parseDuplicationMetricsLogs .parseStarLogs .parseHisat2Logs .parseBowtieLogs .isValidFlagstatLog .isValidQuastLog .isValidTrimmomaticLog .isValidBuscoLog .isValidFeatureCountsLog .isValidCutadaptLog .isValidAdapterRemovalLog .isValidDuplicationMetricsLog .isValidStarLog .isValidHisat2Log .isValidBowtieLog .getToolName importNgsLogs

Documented in .getToolName importNgsLogs .isValidAdapterRemovalLog .isValidBowtieLog .isValidBuscoLog .isValidCutadaptLog .isValidDuplicationMetricsLog .isValidFeatureCountsLog .isValidFlagstatLog .isValidHisat2Log .isValidQuastLog .isValidStarLog .isValidTrimmomaticLog .parseAdapterRemovalLogs .parseBowtieLogs .parseBuscoLogs .parseCutadaptLogs .parseDuplicationMetricsLogs .parseFeatureCountsLogs .parseFlagstatLogs .parseHisat2Logs .parseQuastLogs .parseStarLogs .parseTrimmomaticLogs

#' @title Import Various NGS-related log files
#'
#' @description Imports NGS-related log files such as those generated from
#' stderr \lifecycle{maturing}
#'
#' @details Imports one or more log files as output by tools such as:
#' \code{bowtie}, \code{bowtie2}, \code{featureCounts}, \code{Hisat2},
#' \code{STAR}, \code{picard MarkDuplicates}, \code{cutadapt}, \code{flagstat},
#' \code{Adapter Removal}, \code{trimmomatic} \code{quast} or \code{busco}.
#' \code{autoDetect} can be used to detect the log type by parsing the file.
#'
#' The featureCounts log file corresponds to the \code{counts.out.summary},
#' not the main \code{counts.out} file.
#'
#' Whilst most log files return a single tibble, some are more complex
#' with multiple modules.
#'
#' \code{adapterRemoval} can return one of four modules (which = 1:4),.
#' When calling by name, the possible values are sequences, settings,
#' statistics or distribution.
#' Partial matching is implemented.
#'
#' \code{cutadapt} can return one of five modules (which = 1:5).
#' When calling by name the possible modules are summary, adapter1, adapter2,
#' adapter3 or overview.
#' Note that adapter2/3 may be missing from these files depending on the nature
#' of your data.
#' If cutadapt log files are obtained using \code{report=minimal}, all supplied
#' log files must be of this format and no modules can be returned.
#'
#' \code{duplicationMetrics} will return either the metrics of histogram.
#' These can be requested by setting which as 1 or 2, or naming either module.
#'
#' @param x \code{character}. Vector of filenames. All log files must be of the
#' same type. Duplicate file paths will be silently ignored.
#' @param type \code{character}. The type of file being imported. Can be one of
#' \code{bowtie}, \code{bowtie2}, \code{hisat2}, \code{star}, \code{flagstat},
#' \code{featureCounts}, \code{duplicationMetrics}, \code{cutadapt},
#' \code{adapterRemoval}, \code{quast} or \code{busco}.
#' Defaults to \code{type = "auto"} which will automatically detect the file
#' type for all implemented types.
#' @param which Which element of the parsed object to return. Ignored in all
#' file types except when \code{type} is set to duplicationMetrics, cutadapt or
#' adapterRemoval. See details for possible values
#' @param stripPaths logical(1). Remove paths from the Filename column
#'
#' @return A \code{tibble}.
#' Column names are broadly similar to the text in supplied files,
#' but have been modified for easier handling under R naming conventions.
#'
#' @examples
#' f <- c("bowtiePE.txt", "bowtieSE.txt")
#' bowtieLogs <- system.file("extdata", f, package = "ngsReports")
#' df <- importNgsLogs(bowtieLogs, type = "bowtie")
#'
#' @importFrom lubridate dminutes dhours dseconds parse_date_time
#' @importFrom tidyselect contains everything starts_with ends_with
#' @importFrom stringr str_replace_all str_trim str_remove_all str_extract
#'
#' @export
importNgsLogs <- function(x, type = "auto", which, stripPaths = TRUE) {

    x <- unique(x) # Remove any  duplicates
    stopifnot(length(x) > 0) # Fail on empty vector
    stopifnot(file.exists(x)) # Check they all exist

    ## Check for a valid filetype
    possTypes <- c(
        "adapterRemoval",
        "bowtie",
        "bowtie2",
        "busco",
        "cutadapt",
        "duplicationMetrics",
        "featureCounts",
        "flagstat",
        "hisat2",
        "quast",
        "star",
        "trimmomatic"
    )

    type <- match.arg(type, c("autoDetect", possTypes))


    ## Sort out the which argument
    if (missing(which)) {
        which <- 1L
        if (type == "adapterRemoval") {
            message("Defaulting to the statistics module for Adapter Removal")
            which <- 3L
        }
    }

    ## Load the data
    data <- suppressWarnings(lapply(x, readLines))
    if (stripPaths) x <- basename(x)
    names(data) <- x

    ## adding auto detect
    if (type == "autoDetect") {
        type <- unlist(lapply(data, .getToolName, possTypes = possTypes))
        type <- unique(type)
    }

    ## Change to title case for easier parsing below
    type <- stringr::str_split_fixed(type, pattern = "", n = nchar(type))
    type[1] <- stringr::str_to_upper(type[1])
    type <- paste(type, collapse = "")

    ## Check validity using the helpers defined as private functions
    vFun <- paste0(".isValid", type, "Log")
    validLogs <- vapply(data, eval(parse(text = vFun)), logical(1))
    if (any(!validLogs)) {
        failed <- names(validLogs)[!validLogs]
        stop(paste("Incorrect file structure for:", failed , collapse = "\n"))
    }

    ## The two types of cutadapt output also need to be checked for consistency
    if (type == "Cutadapt") {
        ln <- vapply(data, length, integer(1))
        nMinimal <- sum(ln == 2) # nLines for a minimal report
        if (!nMinimal %in% c(0, length(data))) stop(
            paste0(
                "Some, but not all, reports are minimal format.\n",
                "Please load different format reports separately"
            )
        )

    }

    ## Parse the data using the helpers defined as private functions
    if (is.character(which)) which <- paste0("'", which, "'")
    pFun <- paste0(".parse", type, "Logs(data, which = ", which, ")")
    df <- eval(parse(text = pFun))

    ## Return a tibble
    as_tibble(df)

}



#' @title Identify tool name
#' @description Identify tool name for log files after
#' reading in using readLines.
#' @details Checks for all the required fields in the lines provided
#' @param x Character vector as output when readLines to a supplied log file
#' @return logical(1)
#' @keywords internal
.getToolName <- function(x, possTypes){

    logiList <- lapply(possTypes, function(types){
        ## Change to title case for easier parsing below
        types <- stringr::str_split_fixed(types, pattern = "", n = nchar(types))
        types[1] <- stringr::str_to_upper(types[1])
        types <- paste(types, collapse = "")
        vFun <- paste0(".isValid", types, "Log(x)")
        validLogs <- eval(parse(text = paste0(vFun)))

    })
    logi <- unlist(logiList)

    type <- possTypes[logi]
    ## added for autodetect purposes
    if (all(c("hisat2", "bowtie2") %in% type)) type <- "bowtie2"
    type
}

#' @title Check for correct structure of supplied Bowtie log files
#' @description Check for correct structure of supplied Bowtie log files after
#' reading in using readLines.
#' @details Checks for all the required fields in the lines provided
#' @param x Character vector as output when readLines to a supplied log file
#' @return logical(1)
#' @keywords internal
.isValidBowtieLog <- function(x){
    nLines <- length(x)
    fields <- c(
        "Time loading forward index",
        "Time loading mirror index:",
        "Seeded quality full-index search:",
        "# reads processed:",
        "# reads with at least one reported alignment:",
        "# reads that failed to align:",
        "Time searching:",
        "Overall time:"
    )
    chk <- vapply(fields, function(f){any(grepl(f, x))}, logical(1))
    all(chk)
}

#' @title Check for a valid Hisat2 log
#' @description Checks internal structure of the parsed log file
#' @param x Character vector containing parsed log file using the function
#' readLines
#' @return logical(1)
#' @keywords internal
.isValidHisat2Log <- function(x){
    n <- length(x)
    chkLen <- length(x) > 0
    firstLine <- grepl("reads; of these:$", x[1])
    lastLine <- grepl("overall alignment rate$", x[n])
    noAln <- sum(grepl("aligned 0 times$", x)) == 1
    alnExact <- sum(grepl("aligned exactly 1 time$", x)) == 1
    alnG1 <- sum(grepl("aligned >1 times$", x)) == 1
    all(c(chkLen, firstLine, lastLine, noAln, alnExact, alnG1))
}
.isValidBowtie2Log <- .isValidHisat2Log

#' @title Check for a valid Star Alignment log
#' @description Checks internal structure of the parsed log file
#' @param x Character vector containing parsed log file using the function
#' readLines
#' @return logical(1)
#' @keywords internal
.isValidStarLog <- function(x){
    ## Just check for the key fields
    chkStart <- grepl("Started job on", x[1])
    chkUniq <- any(grepl("UNIQUE READS:", x))
    chkMulti <- any(grepl("MULTI-MAPPING READS:", x))
    chkUnmapped <- any(grepl("UNMAPPED READS:", x))
    all(chkStart, chkUniq, chkMulti, chkUnmapped)
}

#' @title Check for a valid Duplication Metrics log
#' @description Checks internal structure of the parsed log file
#' @param x Character vector containing parsed log file using the function
#' readLines
#' @return logical(1)
#' @keywords internal
.isValidDuplicationMetricsLog <- function(x){

    ## Check the METRICS CLASS data
    metricsHeader <- grep("METRICS CLASS\tpicard.sam.DuplicationMetrics", x)

    if (length(metricsHeader) == 1) { # Must appear only once
        metricCols <- c("LIBRARY", "UNPAIRED_READS_EXAMINED",
                        "READ_PAIRS_EXAMINED", "SECONDARY_OR_SUPPLEMENTARY_RDS",
                        "UNMAPPED_READS", "UNPAIRED_READ_DUPLICATES",
                        "READ_PAIR_DUPLICATES", "READ_PAIR_OPTICAL_DUPLICATES",
                        "PERCENT_DUPLICATION", "ESTIMATED_LIBRARY_SIZE")
        ## Check the column names in the log match the expected columns
        checkMetCols <-
            all(names(.splitByTab(x[metricsHeader + 1])) == metricCols)

        ## Check the HISTOGRAM data
        histHeader <- grep("HISTOGRAM\tjava.lang.Double", x)
        stopifnot(length(histHeader) == 1) # Must appear only once
        histCols <- c("BIN", "VALUE")
        checkHistCols <- all(names(.splitByTab(x[histHeader + 1])) == histCols)

        all(checkMetCols, checkHistCols)

    } else FALSE #give false value if missing

}

#' @title Check for a valid AdapterRemoval log
#' @description Checks internal structure of the parsed log file
#' @param x Character vector containing parsed log file using the function
#' readLines
#' @return logical(1)
#' @keywords internal
.isValidAdapterRemovalLog <- function(x){

    ## Check the first line begins with AdapterRemoval
    checkAR <- grepl("AdapterRemoval", x[[1]])

    ## This should contain four modules, all of which are contained
    ## between square brackets
    modNames <- c(
        "[Adapter sequences]",
        "[Adapter trimming]",
        "[Trimming statistics]",
        "[Length distribution]"
    )
    checkModNames <- stringr::str_subset(x[[1]], "^\\[.+") == modNames

    all(checkAR, checkModNames)
}

#' @title Check for a valid cutadapt log
#' @description Checks internal structure of the parsed log file
#' @param x Character vector containing parsed log file using the function
#' readLines
#' @return logical(1)
#' @keywords internal
.isValidCutadaptLog <- function(x){

    ## These can take two forms, minimal & full
    ## The minimal report is like a tab-delimited data.frame
    ## whilst the full report has a more complex structure
    isMinimal <- length(x) == 2
    reqCols <- c(
        "status", "in_reads", "in_bp", "too_short", "too_long",
        "too_many_n",  "out_reads", "w/adapters", "qualtrim_bp", "out_bp"
    )
    if (isMinimal) {
        ## As this is essentially a df, just check the columns
        df <- tryCatch(.splitByTab(x))
        chkCols <- all(reqCols %in% colnames(df))
        return(TRUE)
    }
    ## Check the complete file
    chkCutAdapt <- grepl("cutadapt", x[[1]])
    chkCols <- list(
        status = any(grepl("Finished in", x)),
        in_reads = any(grepl("Total (reads|read pairs) processed", x)),
        in_bp = any(grepl("Total basepairs processed", x)),
        too_short = any(grepl("(Reads|Pairs) that were too short", x)),
        ## Too long may not be in the file
        too_many_n = any(grepl("(Reads|Pairs) with too many N", x)),
        out_reads = any(
            grepl("(Reads|Pairs) written \\(passing filters\\)", x)
        ),
        `w/adapters` = any(grepl("Read.+ with adapter", x)),
        out_bp = any(grepl("Total written \\(filtered\\)", x))
    )
    chkCols <- unlist(chkCols)
    all(c(chkCutAdapt, chkCols))

}

#' @title Check for a valid featureCounts Summary
#' @description Checks internal structure of the parsed file
#' @param x Character vector containing parsed log file using the function
#' readLines
#' @return logical(1)
#' @keywords internal
.isValidFeatureCountsLog <- function(x){

    if (all(grepl("\t", x))){ ### check all lines have a tab,
        ## required by .splitbyTab

        ## Check the Status column
        x <- .splitByTab(x)
        chkCol1 <- colnames(x)[[1]] == "Status"
        chkTypes <- FALSE
        if (chkCol1) {
            ## We can only do this if the Status column checks out
            vals <- c(
                "Assigned",
                "Unassigned_Ambiguity",
                "Unassigned_MultiMapping",
                "Unassigned_NoFeatures",
                "Unassigned_Unmapped",
                "Unassigned_MappingQuality",
                "Unassigned_FragmentLength",
                "Unassigned_Chimera",
                "Unassigned_Secondary",
                "Unassigned_Duplicate"
            )
            chkTypes <- all(vals %in% x[["Status"]])
        }

        all(chkTypes, chkCol1)
    }
    else FALSE ## print false if not

}

#' @title Check for correct structure of supplied BUSCO log files
#' @description Check for correct structure of supplied BUSCO log files after
#' reading in using readLines.
#' @details Checks for all the required fields in the lines provided
#' @param x Character vector as output when readLines to a supplied log file
#' @return logical(1)
#' @keywords internal
.isValidBuscoLog <- function(x){

    fields <- c(
        "# BUSCO version is:",
        "# The lineage dataset is:",
        "# To reproduce this run:",
        "# Summarized benchmarking in BUSCO notation for file ",
        "# BUSCO was run in mode:"
    )
    chk <- vapply(fields, function(f){any(grepl(f, x))}, logical(1))
    all(chk)
}

#' @title Check for correct structure of supplied Trimmomatic log files
#' @description Check for correct structure of supplied Trimmomatic log files
#' after reading in using readLines.
#' @details Checks for all the required fields in the lines provided
#' @param x Character vector as output when readLines to a supplied log file
#' @return logical(1)
#' @keywords internal
.isValidTrimmomaticLog <- function(x){
    n <- length(x)
    checkL1 <- grepl("Trimmomatic[PS]E: Started with arguments", x[[1]])
    checkMain <- grepl("Input.+ Surviving.+ Dropped.+", x[[n - 1]])
    checkLast <- grepl("Trimmomatic[PS]E: Completed successfully", x[[n]])
    all(checkL1, checkMain, checkLast)
}

#' @title Check for correct structure of supplied Quast log files
#' @description Check for correct structure of supplied Quast log files after
#' reading in using readLines.
#' @details Checks for all the required fields in the lines provided
#' @param x Character vector as output when readLines to a supplied log file
#' @return logical(1)
#' @keywords internal
.isValidQuastLog <- function(x){

    fields <- c(
        "# contigs",
        "Largest contig",
        "N50",
        "N75",
        "Total length",
        "L50",
        "L75",
        "# N's per 100 kbp"
    )
    chk <- vapply(fields, function(f){any(grepl(f, x))}, logical(1))
    all(chk)
}

#' @title Check for correct structure of supplied flagstat
#' @description Check for correct structure of supplied flagstat files
#' @details Checks for all the required fields in the lines provided
#' @param x Character vector as output when readLines to a supplied file
#' @return logical(1)
#' @keywords internal
.isValidFlagstatLog <- function(x){

    ## Every line must have a '+' symbol
    nLines <- length(x)
    allPlus <- sum(grepl("\\+", x)) == nLines
    ## Check for required text in the first line
    firstLine <- grepl("QC-passed reads", x[[1]])
    all(allPlus, firstLine)
}

#' @title Parse data from Bowtie log files
#' @description Parse data from Bowtie log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return data.frame
#' @keywords internal
.parseBowtieLogs <- function(data, ...){

    ## This will have already been through the validity checks
    df <- lapply(data, function(x){
        x <- gsub("# ", "", x)
        ## Treat the line containing the total reads separately
        total <- grep("Reported .* alignments", x = x, value = TRUE)
        x <- setdiff(x, total)
        ## Split the remaining lines into a nx2 matrix,
        ## then change to titles and remove spaces/dashes
        x <- stringr::str_split_fixed(x, pattern = ": ", 2)
        x[,1] <- stringr::str_to_title(x[,1])
        x[,1] <- gsub("( |-)", "_", x[,1])
        ## Remove those percentages from some of the fields
        x[,2] <- gsub("(.+) \\(.+\\)", "\\1", x[,2])
        ## Return a data.frame
        df <- structure(as.list(x[,2]), names = x[,1])
        df <- as.data.frame(df, stringsAsFactors = FALSE)
    })
    df <- dplyr::bind_rows(df)

    ## Some colnames may have a flag from the original bowtie code
    ## This line will correct that after the title case conversion above
    names(df) <- gsub("__(.)", "_-\\L\\1", names(df), perl = TRUE)

    ## Reformat the columns as integers and durations
    timeCols <- grepl("(Time|Full_Index_Search)", names(df))
    df[!timeCols] <- suppressWarnings(lapply(df[!timeCols], as.integer))
    df[timeCols] <- lapply(df[timeCols], function(x){
        x <- stringr::str_split_fixed(x, ":", 3)
        x <- as.numeric(x)
        x <- matrix(x, ncol = 3)
        dhours(x[,1]) + dminutes(x[,2]) + dseconds(x[,3])
    })
    ## Add the filename, reorder the columns & return a tibble
    df$Filename <- names(data)
    dplyr::select(
        df, "Filename", contains("Reads"), contains("Time"), everything()
    )

}

#' @title Parse data from HISAT2 log files
#' @description Parse data from HISAT2 log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return data.frame
#' @keywords internal
.parseHisat2Logs <- function(data, ...){

    df <- lapply(data, function(x){

        x <- stringr::str_trim(x)
        paired <- grepl("were paired", x[2])
        totReads <- gsub("([0-9]*) reads; of these:", "\\1", x[1])
        ## Now find each category then extract the numbers
        noAln <- x[grep("aligned 0 times$", x)]
        noAln <- gsub("([0-9]*) .+ aligned 0 times", "\\1", noAln)
        uniq <- x[grep("aligned exactly 1 time$", x)]
        uniq <- gsub("([0-9]*) .+ aligned exactly 1 time", "\\1", uniq)
        mult <- x[grep("aligned >1 times", x)]
        mult <- gsub("([0-9]*) .+ aligned >1 times", "\\1", mult)

        ## Get the paired only fields
        pairReads <- uniqPairs <- multPairs <- uniqDiscord <- 0
        if (paired) {
            pairReads <- x[grep("were paired; of these:$", x)]
            pairReads <- gsub("([0-9]*) .+ of these:", "\\1", pairReads)
            uniqPairs <- x[grep("aligned concordantly exactly 1 time$", x)]
            uniqPairs <- gsub("([0-9]*) .+ exactly 1 time", "\\1", uniqPairs)
            multPairs <- x[grep("aligned concordantly >1 times$", x)]
            multPairs <- gsub("([0-9]*) .+ >1 times", "\\1", multPairs)
            uniqDiscord <- x[grep("aligned discordantly 1 time$", x)]
            uniqDiscord <- gsub("([0-9]*) .+ 1 time", "\\1", uniqDiscord)
        }

        out <- list(
            Total_Reads = totReads,
            Not_Aligned = noAln,
            Unique_Unpaired = uniq,
            Multiple_Unpaired = mult,
            Paired_Reads = pairReads,
            Unique_In_Pairs = uniqPairs,
            Multiple_In_Pairs = multPairs,
            Unique_Discordant_Pairs = uniqDiscord
        )
        ## Set all values as integers
        out <- lapply(out, as.integer)
        as.data.frame(out)
    })

    ## Bind all reults together
    df <- bind_rows(df)

    ## Add a final column
    df$Alignment_Rate <-
        1 - df$Not_Aligned / (df$Total_Reads + df$Paired_Reads)
    df$Filename <- names(data)
    dplyr::select(
        df,
        "Filename",
        ends_with("Reads"),
        contains("Unique"),
        contains("Multiple"),
        everything()
    )

}
.parseBowtie2Logs <- .parseHisat2Logs

#' @title Parse data from STAR log files
#' @description Parse data from STAR log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return tibble
#' @keywords internal
.parseStarLogs <- function(data, ...){
    ## Reformat as a data.frame / tibble
    df <- lapply(data, function(x){
        ## Split on '|\t'
        x <- stringr::str_split_fixed(x, pattern = "\\|\t", n = 2)
        ## Remove whitespace
        x <- apply(x, MARGIN = 2, stringr::str_trim)
        ## Remove blank rows
        x <- x[rowSums(x == "") == 0,]
        ## Tidy up the percent signs & brackets
        x[, 1] <- gsub("%", "Percent", x[,1])
        x <- apply(x, 2, stringr::str_remove_all, "[%\\(\\)]")
        ## Define the column names
        cols <- stringr::str_to_title(x[,1])
        cols <- gsub("[ :,]+", "_", cols)
        cols <-
            gsub("([ACGTacgt]{2}/[ACGTacgt]{2})", "\\U\\1", cols, perl = TRUE)
        ## Form a list then tibble
        out <- as.list(x[,2])
        names(out) <- cols
        ## Using tibble here preserves column names in a nicer format
        as_tibble(out)
    })
    df <- dplyr::bind_rows(df)

    ## Reformat the time columns
    timeCols <- grepl("On$", names(df))
    df[timeCols] <-
        lapply(df[timeCols], parse_date_time, orders = "b! d! HMS")

    ## Reformat the integer columns
    intCols <- grepl("Number", names(df))
    df[intCols] <- lapply(df[intCols], as.integer)

    ## Set the remaining columns as numeric
    df[!intCols & !timeCols] <- lapply(df[!intCols & !timeCols], as.numeric)

    ## Add the filename & additional columns
    df$Filename <- names(data)
    df$Mapping_Duration <- df$Finished_On - df$Started_Mapping_On
    totMapped <-
        df$Uniquely_Mapped_Reads_Number +
        df$Number_Of_Reads_Mapped_To_Multiple_Loci
    df$Total_Mapped_Percent <- 100*totMapped / df$Number_Of_Input_Reads

    ## Return the output
    dplyr::select(
        df,
        "Filename",
        starts_with("Total"),
        contains("Input"),
        contains("Mapped"),
        contains("Splice"),
        ends_with("On"),
        contains("Mapping"),
        everything()
    )

}

#' @title Parse data from Picard duplicationMetrics log files
#' @description Parse data from Picard duplicationMetrics log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param which which element of the log file to return.
#' Can be 1:2, "metrics" or "histogram"
#' @return tibble
#' @keywords internal
.parseDuplicationMetricsLogs <- function(data, which = 1){

    if (is.character(which))
        which <- match.arg(which, c("metrics", "histogram"))
    if (is.numeric(which)) stopifnot(which %in% c(1, 2))

    ## Collect the metrics from all files as a tibble
    metrics <- lapply(data, function(x){
        ## Find the library name
        libName <- grep(
            "picard.sam.markduplicates.MarkDuplicates.+INPUT=", x, value = TRUE
        )
        libName <- gsub(".+INPUT=\\[(.+)\\] OUTPUT.+", "\\1", libName[[1]])
        ## Find the header. The next two rows will be the colnames + data
        metHeader <- grep("METRICS CLASS\tpicard.sam.DuplicationMetrics", x)
        df <- .splitByTab(x[seq(metHeader + 1, by = 1, length.out = 2)])
        df$LIBRARY <- libName
        df
    })
    metrics <- dplyr::bind_rows(metrics)
    ## Now ensure the correct types
    metrics$PERCENT_DUPLICATION <- as.numeric(metrics$PERCENT_DUPLICATION)
    intCols <- setdiff(colnames(metrics), c("LIBRARY", "PERCENT_DUPLICATION"))
    metrics[intCols] <- lapply(metrics[intCols], as.integer)
    metrics <- as_tibble(metrics)

    ## Collect the histogram data from all files as a tibble
    histData <- lapply(data, function(x){
        ## Find the library name
        libName <- grep(
            "picard.sam.markduplicates.MarkDuplicates.+INPUT=", x, value = TRUE
        )
        libName <- gsub(".+INPUT=\\[(.+)\\] OUTPUT.+", "\\1", libName[[1]])

        ## Find the header then remove up until that line
        histHeader <- grep("HISTOGRAM\tjava.lang.Double", x)
        x <- x[-seq_len(histHeader)]
        ## Remove any blank lines (this is the last line)
        x <- x[!grepl("^$", x)]
        df <- .splitByTab(x)
        df$LIBRARY <- libName
        dplyr::select(df, "LIBRARY", everything())
    })
    histData <- dplyr::bind_rows(histData)
    ## Ensure the correct types
    histData$BIN <- as.integer(histData$BIN)
    histData$VALUE <- as.numeric(histData$VALUE)
    histData <- as_tibble(histData)

    ## Setup the output, then format the column names
    out <- list(metrics = metrics, histogram = histData)
    lapply(out, function(x){
        colnames(x) <- stringr::str_replace_all(colnames(x), "_", " ")
        colnames(x) <- stringr::str_to_title(colnames(x))
        x
    })
    out[[which]]
}

#' @title Parse data from Adapter Removal log files
#' @description Parse data from Adapter Removal log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param which which element of the log file to return.
#' Can be 1:4, "sequences", "settings", "statistics" or "distribution"
#' @return tibble
#' @keywords internal
#' @importFrom stringr str_split_fixed
.parseAdapterRemovalLogs <- function(data, which = 3){

    ## These are renamed for easier understanding.
    ## The full mapping of titles are:
    ## 1: sequences = Adapter sequences
    ## 2: settings = Adapter trimming
    ## 3: statistics =  Trimming statistics
    ## 4: distribution = Length distribution
    ## No parsing of the version number has been implemented (1st row)
    ## Note that for each file, 'distribution' will return a tibble
    ## with nrow > 1
    modNames <- c("sequences", "settings", "statistics", "distribution")
    if (is.character(which)) which <- match.arg(which, modNames)
    if (is.numeric(which)) stopifnot(which %in% seq_len(4))

    .parseArSingle <- function(x, which){
        ## Remove the blank elements
        x <- setdiff(x, "")
        ## Find where the modules start
        vals2Mods <- cumsum(stringr::str_detect(x, "^\\["))

        ## Just do the modules manually
        ## Start with the sequences
        sequences <- str_split_fixed(
            string = x[vals2Mods == 1][-1],
            pattern = ": ",
            n =  2
        )
        sequences <- as_tibble(
            split(sequences[,2], f = sequences[,1])
        )
        ## Now the settings
        settings <- str_split_fixed(
            string = x[vals2Mods == 2][-1],
            pattern = ": ",
            n =  2
        )
        settings <- split(
            x = settings[,2],
            f = factor(settings[,1], levels = settings[,1])
        )
        ## Convert to the correct value type
        intCols <- grepl("(score |value|length|Minimum)", names(settings))
        intCols <- intCols & !grepl("Maximum", names(settings))
        settings[intCols] <- lapply(settings[intCols], as.integer)
        numCols <- grepl("threshold", names(settings))
        settings[numCols] <- lapply(settings[numCols], as.numeric)
        settings <- as_tibble(settings)
        settings[["RNG seed"]] <- ifelse(
            settings[["RNG seed"]] == "NA",
            NA_integer_,
            as.integer(settings[["RNG seed"]])
        )
        ## Statistics
        statistics <- str_split_fixed(x[vals2Mods == 3][-1], ": ", 2)
        statistics <- split(
            x = statistics[,2],
            f = factor(statistics[,1], levels = statistics[,1])
        )
        statistics <- lapply(statistics, as.numeric)
        statistics <- as_tibble(statistics)
        ## Distribution
        distribution <- .splitByTab(x[vals2Mods == 4][-1])
        distribution <- lapply(distribution, as.integer)
        distribution <- as_tibble(distribution)

        ## Setup the output, then format the column names
        out <- list(
            sequences = sequences,
            settings = settings,
            statistics = statistics,
            distribution = distribution
        )
        out[[which]]
    }

    arOut <- lapply(data, .parseArSingle, which = which)
    arOut <- lapply(names(arOut), function(x){
        x <- dplyr::mutate(arOut[[x]], Filename = x)
        dplyr::select(x, Filename, everything())
    })
    ## Now bind all tibbles & returns
    bind_rows(arOut)

}

#' @title Parse data from cutadapt log files
#' @description Parse data from cutadapt log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param which which element of the log file to return.
#' Can be summary, adapter1, adapter2, adapter3 or overview, or any integer in
#' 1:5
#' @return tibble
#' @keywords internal
#' @importFrom tidyselect everything
#' @importFrom dplyr mutate_at mutate_if
#' @importFrom stringr str_replace_all str_remove_all str_extract
.parseCutadaptLogs <- function(data, which = 1){

    ## Possible module names vary depending on PE/SE reads and the adapter
    ## configuration. The only stable name is `summary`.
    ## For this reason module name checking will be handled inside the
    ## parsing function

    .parseCutAdaptSingle <- function(x, which){

        ## Check if it is a minimal report & return the df if TRUE
        isMinimal <- length(x) == 2
        if (isMinimal) {
            df <- .splitByTab(x)
            ## Values may sometimes be > 2^31 so convert to doubles
            df <- dplyr::mutate_if(df, colnames(df) != "status", as.numeric)
            if (which != 1) message(
                paste0(
                    "Minimal report provided.",
                    "The 'which' argument will be ignored"
                )
            )
            return(df)
        }

        ## Otherwise parse the modules:
        ## Remove the blank elements
        x <- x[x != ""]
        ## Find where the modules start
        mods <- grepl("===", x)
        foundMods <- tolower(str_remove_all(x[mods],"[= ]"))

        ## Check the requested module
        if (is.numeric(which) & which > length(foundMods)) stop(
            "Invalid module selected. The maximum number is ", length(foundMods)
        )
        if (is.character(which) & sum(grepl(which, foundMods)) != 1) stop(
            "Unable to determine module. Valid modules are ",
            paste(foundMods, collapse = "/")
        )

        ## Now split into the modules
        x <- split(x, f = cumsum(mods))
        names(x) <- c("header", foundMods)
        ## Grab the header as a list
        hdr <- x[["header"]]
        names(hdr) <- c("version", "params", "processing", "finished")
        ## Remove the first value from each now we have the intact header
        x <- lapply(x, function(x){x[-1]})

        ## Just do the modules manually
        out <- vector("list", length(x) - 1)
        names(out) <- names(x)[-1]

        ## Start with the summary (i.e. minimal). Values then names
        vals <- str_replace_all(x$summary, ".+ +([0-9,]+).+", "\\1")
        vals <- str_remove_all(vals, ",")
        vals <- as.numeric(vals)
        nm <- str_trim(
            str_replace_all(x$summary, "(.+):.+", "\\1")
        )
        ## If this is paired end, some of these will be nested and need to be
        ## handled correctly. The first two will be processed bp, whilst
        ## the next two will be written bp. Using the above code, they should
        ## just appear as Read 1 and Read 2 for each set.
        dups <- grepl("^Read [12]$", nm)
        nm[dups] <- paste(
            nm[dups],
            "basepairs",
            rep(
                c("processed", "written"), each = sum(dups)/2
            )
        )
        nm <- tolower(nm)
        nm <- str_remove_all(nm, "[\\(\\)]")
        nm <- str_replace_all(nm, " ", "_")
        names(vals) <- nm
        out[["summary"]] <- as_tibble(as.list(vals))
        out[["summary"]][["header"]] <- list(hdr)
        out[["summary"]] <- dplyr::select(
            out[["summary"]], "header", everything()
        )

        ## Now the adapters
        out[grep("adapter", names(out))] <- lapply(
            x[grepl("adapter", names(x))],
            function(a){
                ## Avoid R CMD errors by declaring dummy variables
                A <- C <- G <- `T` <- c()
                main <- tibble(
                    sequence = str_extract(
                        a[grepl("Sequence", a)], "[ACGTN]+"
                    ),
                    type = str_replace_all(
                        a[grepl("Type", a)],
                        ".+Type: (.+); Length.+",
                        "\\1"
                    ),
                    length = as.integer(
                        str_replace_all(
                            a[grepl("Length", a)],
                            ".+Length: (.+); Trimmed.+",
                            "\\1"
                        )
                    ),
                    A = as.numeric(
                        str_extract(a[grepl("A:", a)], "[0-9\\.]+")
                    ) / 100,
                    C = as.numeric(
                        str_extract(a[grepl("C:", a)], "[0-9\\.]+")
                    ) / 100,
                    G = as.numeric(
                        str_extract(a[grepl("G:", a)], "[0-9\\.]+")
                    ) / 100,
                    `T` = as.numeric(
                        str_extract(a[grepl("T:", a)], "[0-9\\.]+")
                    ) / 100,
                    `none/other` = round(1 - (A + C + G + `T`), 2)
                )
                overview <- .splitByTab(
                    a[seq(which(grepl("^Overview", a)) + 1, length(a))]
                )
                overview <- dplyr::mutate_if(
                    overview,
                    vapply(
                        overview,
                        function(x){
                            suppressWarnings(all(!is.na(as.numeric(x))))
                        },
                        logical(1)
                    ),
                    as.numeric
                )
                main[["overview"]] <- list(as_tibble(overview))
                main
            }
        )

        ## Return the required module
        out[[which]]
    }

    caOut <- lapply(data, .parseCutAdaptSingle, which = which)
    caOut <- lapply(names(caOut), function(x){
        x <- dplyr::mutate(caOut[[x]], Filename = x)
        dplyr::select(x, Filename, everything())
    })
    ## Now bind all tibbles & returns
    bind_rows(caOut)

}

#' @title Parse data from featureCounts summary files
#' @description Parse data from featureCounts summary files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return tibble
#' @keywords internal
#' @importFrom forcats fct_inorder
#' @importFrom tidyselect ends_with
.parseFeatureCountsLogs <- function(data, ...){

    out <- lapply(data, function(x){
        x <- .splitByTab(x)
        x <- tidyr::pivot_longer(
            data = x,
            cols = ends_with("bam"),
            names_to = "Sample",
            values_to = "Total"
        )
        x$Sample <- x$Sample
        x$Total <- as.integer(x$Total)
        x$Status <- fct_inorder(x$Status)
        tidyr::pivot_wider(
            data = x,
            id_cols = "Sample",
            names_from = "Status",
            values_from = "Total"
        )
    })

    out <- lapply(names(data), function(x){
        out[[x]]$Filename = x
        dplyr::select(out[[x]], Filename, everything())
    })
    bind_rows(out)
}

#' @title Parse data from BUSCO log files
#' @description Parse data from BUSCO log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return data.frame
#' @keywords internal
.parseBuscoLogs <- function(data, ...){

    ## This will have already been through the validity checks
    df <- lapply(data, function(x){
        x <- gsub("# ", "", x)
        ## get numbers of each feild from lines
        single <-
            grep("Complete and single-copy BUSCOs \\(S\\)", x = x, value = TRUE)
        single <- as.integer(gsub(".*\t(.+)\tComp.*", "\\1", single))

        duplicated <-
            grep("Complete and duplicated BUSCOs \\(D\\)", x = x, value = TRUE)
        duplicated <- as.integer(gsub(".*\t(.+)\tComp.*", "\\1", duplicated))

        fragmented <- grep("Fragmented BUSCOs \\(F\\)", x = x, value = TRUE)
        fragmented <- as.integer(gsub(".*\t(.+)\tFrag.*", "\\1", fragmented))

        missing <- grep("Missing BUSCOs \\(M\\)", x = x, value = TRUE)
        missing <- as.integer(gsub(".*\t(.+)\tMiss.*", "\\1", missing))

        total <- sum(single, duplicated, fragmented, missing)

        name <- grep(
            "Summarized benchmarking in BUSCO notation for file ",
            x = x,
            value = TRUE
        )
        name <- gsub(".*file ", "\\1", name)
        df <- tibble(
            name = name,
            completeSingleCopy = single,
            completeDuplicated = duplicated,
            fragmented = fragmented,
            missing = missing
        )
    })
    df <- dplyr::bind_rows(df)

}

#' @title Parse data from trimmomatic log files
#' @description Parse data from trimmomatic log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... not used
#' @return tibble
#' @keywords internal
#' @importFrom tidyselect everything starts_with contains
.parseTrimmomaticLogs <- function(data, ...){

    .parseTrimmoSingle <- function(x){

        ## Initialise values which may or may not be present
        Input_Reads <- Input_Read_Pairs <- Surviving <- Both_Surviving <- NA
        Forward_Only_Surviving <- Reverse_Only_Surviving <- NA

        readType <- gsub(".+(PE|SE).+", "\\1", x[[1]])
        Illumina_Clip <- ifelse(
            grepl("ILLUMINACLIP", x[[2]]),
            gsub(".+ILLUMINACLIP:([^ ]+).+", "\\1", x[[2]]),
            NA
        )
        Leading <- ifelse(
            grepl("LEADING", x[[2]]),
            as.integer(gsub(".+LEADING:([0-9:]+).+", "\\1", x[[2]])),
            NA_integer_
        )
        Trailing <- ifelse(
            grepl("TRAILING", x[[2]]),
            as.integer(gsub(".+TRAILING:([0-9:]+).+", "\\1", x[[2]])),
            NA_integer_
        )
        Crop <- ifelse(
            grepl(" CROP", x[[2]]),
            as.integer(gsub(".+ CROP:([0-9:]+).+", "\\1", x[[2]])),
            NA_integer_
        )
        Head_Crop <- ifelse(
            grepl("HEADCROP", x[[2]]),
            as.integer(gsub(".+HEADCROP:([0-9:]+).+", "\\1", x[[2]])),
            NA_integer_
        )
        Sliding_Window <- ifelse(
            grepl("SLIDINGWINDOW", x[[2]]),
            gsub(".+SLIDINGWINDOW:([0-9:]+).+", "\\1", x[[2]]),
            NA
        )
        Min_Len <- ifelse(
            grepl("MINLEN", x[[2]]),
            gsub(".+MINLEN:([0-9:]+).+", "\\1", x[[2]]),
            NA
        )
        Max_Info <- ifelse(
            grepl("MAXINFO", x[[2]]),
            gsub(".+MAXINFO:([^ ]+).+", "\\1", x[[2]]),
            NA
        )
        Avg_Qual <- ifelse(
            grepl("AVGQUAL", x[[2]]),
            as.integer(gsub(".*AVGQUAL:([0-9]+).*", "\\1", x[[2]])),
            NA_integer_
        )
        Quality_Encoding <- grep("Quality encoding", x, value = TRUE)
        Quality_Encoding <-
            gsub("Quality encoding detected as ", "", Quality_Encoding)

        ## Get the line with the summary values
        valLine <- x[[length(x) - 1]]

        if (readType == "SE") {

            Input_Reads <- gsub("Input Reads: ([0-9]+).+", "\\1", valLine)
            Input_Reads <- as.integer(Input_Reads)

            Surviving <- gsub(".+Surviving: ([0-9]+).+", "\\1", valLine)
            Surviving <- as.integer(Surviving)

        }
        if (readType == "PE") {

            Input_Read_Pairs <-
                gsub("Input Read Pairs: ([0-9]+).+", "\\1", valLine)
            Input_Read_Pairs <- as.integer(Input_Read_Pairs)

            Both_Surviving <-
                gsub(".+Both Surviving: ([0-9]+).+", "\\1", valLine)
            Both_Surviving <- as.integer(Both_Surviving)

            Forward_Only_Surviving <-
                gsub(".+Forward Only Surviving: ([0-9]+).+", "\\1", valLine)
            Forward_Only_Surviving <- as.integer(Forward_Only_Surviving)

            Reverse_Only_Surviving <-
                gsub(".+Reverse Only Surviving: ([0-9]+).+", "\\1", valLine)
            Reverse_Only_Surviving <- as.integer(Reverse_Only_Surviving)
        }
        Dropped <- gsub(".+Dropped: ([0-9]+).+", "\\1", valLine)
        Dropped <- as.integer(Dropped)

        tibble(
            Type = readType,
            Input_Reads,
            Input_Read_Pairs,
            Surviving,
            Both_Surviving,
            Forward_Only_Surviving,
            Reverse_Only_Surviving,
            Dropped,
            Illumina_Clip,
            Sliding_Window,
            Max_Info,
            Leading,
            Trailing,
            Crop,
            Head_Crop,
            Min_Len,
            Avg_Qual,
            Quality_Encoding
        )

    }

    out <- lapply(data, .parseTrimmoSingle)
    out <- dplyr::bind_rows(out)
    out$Filename <- names(data)

    ## Many of the above values may be missing.
    ## Remove them if so using a quick tidy
    value <- c() # Avoiding an R CMD check NOTE
    out <- tidyr::gather(out, "key", "value", -1)
    out <- dplyr::filter(out, !is.na(value))
    out <- tidyr::spread(out, "key", "value")

    ## Return the final output
    dplyr::select(
        out,
        "Filename",
        "Type",
        starts_with("Input"),
        contains("Surviving"),
        everything()
    )

}

#' @title Parse data from BUSCO log files
#' @description Parse data from BUSCO log files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return data.frame
#' @keywords internal
.parseQuastLogs <- function(data, ...){

    ## This will have already been through the validity checks
    df <- lapply(data, function(x){
        x <- gsub("# ", "", x)
        ## get Assembly stats from lines
        totalL <- grep("Total length\t", x = x, value = TRUE)
        totalL <- as.integer(gsub(".*\t(.+)", "\\1", totalL))

        N50 <- grep("N50\t", x = x, value = TRUE)
        N50 <- as.integer(gsub(".*\t(.+)", "\\1", N50))

        N75 <- grep("N75\t", x = x, value = TRUE)
        N75 <- as.integer(gsub(".*\t(.+)", "\\1", N75))

        L50 <- grep("L50\t", x = x, value = TRUE)
        L50 <- as.integer(gsub(".*\t(.+)", "\\1", L50))

        L75 <- grep("L75\t", x = x, value = TRUE)
        L75 <- as.integer(gsub(".*\t(.+)", "\\1", L75))

        longest <- grep("Largest contig\t", x = x, value = TRUE)
        longest <- as.integer(gsub(".*\t(.+)", "\\1", longest))

        df <- tibble(
            totalLength = totalL,
            longestTig = longest,
            N50 = N50,
            N75 = N75,
            L50 = L50,
            L75 = L75
        )
    })
    df <- dplyr::bind_rows(df)

    dplyr::bind_cols(tibble(fileNames = names(data)), df)

}

#' @title Parse data from samtools flagstat files
#' @description Parse data from samtools flagstat files
#' @details Checks for structure will have been performed
#' @param data List of lines read using readLines on one or more files
#' @param ... Not used
#' @return data.frame
#' @keywords internal
.parseFlagstatLogs <- function(data, ...){

    .parseFlagstatSingle <- function(data){
        df <- gsub("([0-9]+) \\+ ([0-9]+) (.+)", "\\1\t\\2\t\\3", data)
        df <- .splitByTab(df, FALSE)
        colnames(df) <- c("QC-passed", "QC-failed", "flag")
        df[["flag"]] <- gsub("([a-z ]+) \\(.+", "\\1", df[["flag"]])
        df[["flag"]][nrow(df)] <-
            paste(df[["flag"]][nrow(df)], "(mapQ>=5)")
        df[["QC-passed"]] <- as.integer(df[["QC-passed"]])
        df[["QC-failed"]] <- as.integer(df[["QC-failed"]])
        df
    }

    out <- lapply(data, .parseFlagstatSingle)
    out <- lapply(names(out), function(x){
        out[[x]]$Filename <- x
        out[[x]]
    })
    out <- bind_rows(out)
    dplyr::select(out, "Filename", everything())


}

Try the ngsReports package in your browser

Any scripts or data that you put into this service are public.

ngsReports documentation built on Nov. 23, 2020, 2:01 a.m.