R/logfile_helpers.R

Defines functions spread_and_rename_numeric_fields parse_numeric_fields parse_colon_separated_lines

Documented in parse_colon_separated_lines parse_numeric_fields spread_and_rename_numeric_fields

###############################################################################

# Helper functions for parsing alignment-tool logfiles

###############################################################################

#' spread_and_rename_numeric_fields
#'
#' Renames the entries of `x$field` using the mapping in `fieldnames` and
#' reshapes the resulting (field, value) pairs into a wide dataframe.
#'
#' @param        x             A dataframe containing two columns "field" and
#'   "value". The elements in "field" must be unique; they will be renamed
#'   according to the mapping in `fieldnames`. Any other column in `x` is
#'   ignored.
#' @param        fieldnames    A dataframe containing two columns: "expected"
#'   and "output". These are passed to `replace_with()` as the search and
#'   replacement values, respectively.
#'
#' @return       A dataframe. The columns ordering corresponds to the "field"
#'   column in `x` (one column for each row in `x` and the columns are named
#'   according to the reformatting defined by `fieldnames`).
#'
#' @importFrom   dplyr         mutate
#' @importFrom   rlang         .data
#' @importFrom   tidyr         spread
#' @include      utilities.R
#'

spread_and_rename_numeric_fields <- function(x, fieldnames) {
  if (!all(c("expected", "output") %in% colnames(fieldnames))) {
    stop("columns (expected, output) should be present in `fieldnames`")
  }

  if (!all(c("field", "value") %in% colnames(x))) {
    stop("columns (field, value) should be present in `x`")
  }

  if (any(duplicated(x$field))) {
    stop("The elements in `x$field` should be unique")
  }

  # Only the key/value columns take part in the reshape: any additional column
  # in `x` would otherwise act as a row identifier in `spread()` and could
  # split the result over several, NA-padded rows.
  df <- dplyr::mutate(
    x[c("field", "value")],
    field = replace_with(
      .data[["field"]], fieldnames$expected, fieldnames$output,
      strict = TRUE
    )
  )

  # `spread()` accepts the key / value columns as strings, so the deprecated
  # lazyeval variant `spread_()` is not needed.
  # The column order returned by `spread()` need not match the row order of
  # `x`; indexing by `df$field` restores the order in which the fields occurred.
  tidyr::spread(df, key = "field", value = "value")[df$field]
}

#' parse_numeric_fields
#'
#' Extracts the numeric, colon-separated statistics from the text of a logfile.
#'
#' @param        x             A single character string containing
#'   newline-separated text. Every line that has a digit somewhere after a
#'   colon is treated as a colon-separated key-value pair; all other lines
#'   are dropped.
#'
#' @return       A dataframe. Contains columns "field" and "value" where
#'   the entries of \code{value} are the numbers found within the text \code{x}
#'   for the fields in \code{field}. The fieldnames _may_ be duplicated but are
#'   returned in the same order as they are observed in the logfile.
#'
#' @importFrom   dplyr        mutate
#' @importFrom   readr        parse_number
#' @importFrom   rlang        .data
#' @importFrom   stringr      str_subset   str_replace
#'

parse_numeric_fields <- function(x) {
  # `x` should be bare text, not split
  stopifnot(is.character(x) && length(x) == 1)

  # Split the text into distinct lines, and keep only those lines with a
  # numeric statistic on the RHS of a colon
  all_lines <- unlist(strsplit(x, "\n"))
  numeric_lines <- stringr::str_subset(all_lines, ":.*[[:digit:]].*")

  # Trailing decorations that are dropped from each line, in this order:
  # - parenthesised, percent eg, "some_stat : 123 (0.5%)"
  # - `bp` basepair indicators eg, "Remaining read pairs : 123 bp"
  # - percent signs eg, "percentage_statistic : 98.1%"
  trailing_patterns <- c(
    "\\([[:graph:]]*%\\)$",
    "bp[[:blank:]]*$",
    "%[[:blank:]]*$"
  )
  for (pattern in trailing_patterns) {
    numeric_lines <- stringr::str_replace(numeric_lines, pattern, "")
  }

  # Convert into key-value (string -> string) pairs ...
  key_value_pairs <- parse_colon_separated_lines(numeric_lines)

  # ... and then into key-value (string -> number) pairs
  dplyr::mutate(
    key_value_pairs,
    value = readr::parse_number(.data[["value"]])
  )
}

#' parse_colon_separated_lines
#'
#' Splits every element of a vector at its first colon and trims the
#' whitespace that surrounds each half. The text before the colon is returned
#' in the column `field`, the text after it in the column `value`.
#'
#' Not exported
#'
#' @param        x            Vector of colon-separated values. Every element
#'   must contain at least one colon.
#'
#' @return       `tibble` with two columns: 'field' and 'value'. Any
#'   leading/trailing whitespace is trimmed off both the field and value.
#'
#' @importFrom   dplyr         mutate_all
#' @importFrom   magrittr      %>%   set_colnames
#' @importFrom   stringr       str_split_fixed
#' @importFrom   tibble        as_tibble
#'

parse_colon_separated_lines <- function(x) {
  if (missing(x)) {
    stop(
      "character vector `x` should be defined in parse_colon_separated_lines"
    )
  }

  # die if any of the input vector lacks a colon
  stopifnot(all(grepl(":", x)))

  # Two-column character matrix: the text before / after the first colon
  halves <- stringr::str_split_fixed(x, ":", n = 2)
  colnames(halves) <- c("field", "value")

  # Convert to a tibble and strip leading / trailing whitespace in each column
  dplyr::mutate_all(tibble::as_tibble(halves), trimws)
}
russHyde/reeq documentation built on March 10, 2020, 5:36 a.m.