JACUSA2helper: Post-processing for JACUSA2 output

Documented in base_call get_info_keys id read_result read_results unpack_info

#' Read JACUSA2 result file
#'
#' \code{read_result()} reads data that was generated by JACUSA2 and creates a JACUSA2 result object.
#'
#' @param file String that represents the file name of the JACUSA2 output or an URL.
#' @param cond_desc Vector of strings that represent names/descriptions for conditions.
#' @param unpack Boolean or vector of fields to be unpacked.
#' @param tmpdir String @see `data.table::fread`.
#' @param showProgress Boolean @see `data.table::fread`.
#' @param ... parameters forwarded to `data.table::fread`.
#'
#' @importFrom magrittr %>%
#'
#' @export
read_result <- function(file, cond_desc = c(), unpack = FALSE, tmpdir = tempdir(), showProgress = FALSE, ...) {
  # if url download first
  if (grepl("ftp://|http://", file)) {
    # partly adopted from data.table::fread
    tmpFile <- tempfile(tmpdir = tmpdir)
    if (!requireNamespace("curl", quietly = TRUE)) 
      stop("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.")

    tmpFile = tempfile(fileext = paste0(".", tools::file_ext(file)), tmpdir = tmpdir)
    curl::curl_download(file, tmpFile, mode = "wb", quiet = !showProgress)
    file = tmpFile
    on.exit(unlink(file), add = TRUE)
  }

  # pre process file
  # parse comments ^(#|##) to determine result/method type and number of conditions  
  fun <- ifelse(grepl('\\.gz$', file), base::gzfile, base::file)
  con <- fun(file, "r")
  cond_count <- 0
  if (length(cond_desc) > 0) {
    cond_count <- length(cond_desc)
  }
  skip_lines <- 0
  header_names <- NULL
  jacusa_header <- c()
  # possible result/method types: unknown, call-pileup, rt-arrest, lrt-arrest
  type <- .UNKNOWN_METHOD
  while (TRUE) {
    line = readLines(con, n = 1)
    # quit reading: nothing to read or first no header line 
    if (length(line) == 0 || length(grep("^#", line)) == 0) {
      break
    }
    # count header lines to ignore
    skip_lines <- skip_lines + 1

    if (length(grep("^#contig", line)) > 0) {
      # try to guess result/method by header line
      type <- .guess_file_type(line)
      # type will be valid or .guess_file_type throw error

      # parse and store header
      # fix header: #contig -> contig
      header_names <- sub("^#", "", line);
      header_names <- unlist(base::strsplit(header_names, split = "\t", fixed = TRUE))
      
      # guess number of conditions  
      guessed_cond_count <- .guess_cond_count(type, header_names)
      if (cond_count > 0) {
        if (guessed_cond_count != cond_count) {
          stop("Length of description(", 
               cond_count, 
               ") and conditions(", guessed_cond_count, ") don't match.")
        }
      }
      cond_count <- guessed_cond_count
    } else if (length(grep("^##", line)) > 0) {
        jacusa_header <- c(gsub("^##", "", line), jacusa_header)
    }
    
  }
  # finished pre-processing
  close(con)
  
  # check that a header could be parsed
  if (is.null(header_names)) {
    stop("No header line for file: ", file)
  }

  # check that conditions could be guessed
  if (cond_count < 1) {
    stop("Conditions could not be guessed for file: ", file)
  }

  # read data
  data <- data.table::fread(
    file, 
    skip = skip_lines, 
    sep = "\t",
    header = FALSE, 
    ...
  )  
  colnames(data) <- header_names
  # convert to numeric
  i <- data[, 5] == .EMPTY
  if (any(i)) {
    data[i, 5] <- NA
    data[, 5] <- as.numeric(data[, 5])
  }
 
  # create result data depending on determined method type 
  data <- .create_result(type, cond_count, data)

  # unpack info field
  if (!is.null(unpack)) {
    if (is.logical(unpack) && unpack) {
      prefixes <- .type2prefix(type)
      prefix <- prefixes[1]
      ins_keys <- c(paste0("ins", .extract_cond_repl(header_names, prefix)), "insertion_score", "insertion_pvalue")
      del_keys <- c(paste0("del", .extract_cond_repl(header_names, prefix)), "deletion_score", "deletion_pvalue")
      
      unpack_keys <- c(
        ins_keys, del_keys,
        "seq",
        "arrest_score",
        paste0("reset", c(1:cond_count, "P")),
        paste0("backtrack", c(1:cond_count, "P"))
      )
    } else {
      unpack_keys <- unpack
    }

    unpacked_info <- unpack_info(data$info, cond_count, unpack_keys)
    if (!is.null(unpacked_info)) {
      GenomicRanges::mcols(data) <- cbind(GenomicRanges::mcols(data), unpacked_info)
    }
  }
  
  # TODO remove type stuff and move to RangedExperimentResult
  attr(data, .ATTR_TYPE) <- type
  if (length(cond_desc)) {
    attr(data, .ATTR_COND_DESC) <- cond_desc
  }
  attr(data, .ATTR_HEADER) <- jacusa_header

  data
}

# Create result for type and conditions from data 
.create_result <- function(type, cond_count, data) {
  cols <- NULL
  if (type == .CALL_PILEUP) {
    cols <- c(.CALL_PILEUP_COL)
  } else if (type %in% c(.RT_ARREST, .LRT_ARREST)) {
    cols <- c(.ARREST_COL, .THROUGH_COL)
  } else {
    stop("Unknown type: ", type)
  }
  data <- .unpack_cols(data, cols, cond_count, .BASES)

  l <- list(
    seqnames = data$contig,
    ranges = IRanges::IRanges(start = data$start + 1, end = data$end),
    strand = GenomicRanges::strand(gsub("\\.", "*", data$strand))
  )
  gr <- do.call(GenomicRanges::GRanges, l)
  GenomicRanges::mcols(gr) <- data[, setdiff(colnames(data), c("contig", "start", "end", "strand"))]
  
  # process arrest and through columns
  if (type %in% c(.RT_ARREST, .LRT_ARREST)) {
    GenomicRanges::mcols(gr)[[.CALL_PILEUP_COL]] <- mapply_repl(
      Reduce,
      GenomicRanges::mcols(gr)[[.ARREST_COL]], 
      GenomicRanges::mcols(gr)[[.THROUGH_COL]],
      MoreArgs = list("f" = "+")
    )
    gr <- add_arrest_rate(gr)
  }

  # add coverage
  GenomicRanges::mcols(gr)[[.COV]] <- coverage(GenomicRanges::mcols(gr)[[.CALL_PILEUP_COL]])

  gr
}

#' Merged coordinate information
#' 
#' Merge contig:start|start-end:strand from AJCUSA2 result object
#' 
#' @param result object created by \code{read_result()} or \code{read_results()}.
#' @return merged coorindates information
#' 
#' @export
id <- function(result) {
  paste0(
    GenomicRanges::seqnames(result),
    ":",
    GenomicRanges::start(result),
    "-",
    GenomicRanges::end(result),
    ":",
    GenomicRanges::strand(result)
  )
}

#' Observed base calls.
#' 
#' Observed base calls.
#' 
#' @param bases tibble of base call counts
#' @return vector of observed base calls
#' @export
base_call <- function(bases) {
  apply(bases > 0, 1, function(x) {
    paste0(names(which(x)), collapse = "") 
    }
  )
}

# match columns from df  that are prefixed with types
.matches <- function(df, prefixes, cond_count) {
  # regex parts:
  prefix_regex = paste0("^(", paste0(prefixes, collapse = "|"), ")")
  cond_regex = paste0("([0-9]{", nchar(cond_count), "})")
  repl_regex = "([0-9]+)"

  # get relevant cols and disect to m(atch):
  # match, base_type, condition, replicate
  cols <- grep(paste0(prefix_regex, cond_regex, repl_regex), names(df), value = TRUE)
  m <- do.call(
    rbind, 
    regmatches(names(df), regexec(paste0(prefix_regex, cond_regex, repl_regex), names(df)))
  )

  col_names <- c("col", "prefix", "cond", "repl")
  if (ncol(m) != length(col_names)) {
    m <- matrix(character(), 0, length(col_names))
  }
  colnames(m) <- c("col", "prefix", "cond", "repl")
  matches <- as.data.frame(m, stringsAsFactors = FALSE)

  matches
}

# replace empty with actual values
.fill_empty <- function(df, cols, new_cols) {
  new_value <- paste0(rep(0, length(new_cols)), collapse = ",")

  for (col in cols) {
    i <- df[, col] == .EMPTY | df[, col] == ""
    if (length(i)) {
      df[i, col] <- new_value
    }
  }

  df
}

#' Get keys of info field.
#' 
#' Get keys of info field.
#' 
#' @param info Vector of strings.
#' @return returndfs a vector of available keys.
#' 
#' @export
get_info_keys <- function(info) {
  keys <- base::strsplit(gsub("(;?[^=;]+)=[^=;]+", "\\1", info), ";", fixed = TRUE) %>% unlist() %>% unique()
  return(setdiff(keys, .EMPTY))
}

#' Unpack info field
#' 
#' Unpacks info field.
#' 
#' @param info Vector of stings.
#' @param cond_count integer Number of conditions.
#' @param keys vector of string of keys to extract from "info".
#' @return returns a JACUSA2 result object with "info" unpacked.
#' 
#' @export
unpack_info <- function(info, cond_count, keys=get_info_keys(info)) {
  if (is.null(keys) || length(keys) == 0) {
    return(NULL)
  }

  info <- fast_unpack_info(info, names = keys) %>% as.data.frame()

  for (col in c(paste0("reset", c(1:cond_count, "P")), paste0("backtrack", c(1:cond_count, "P")))) {
    if (col %in% colnames(info)) {
      i <- info[, col] == ""
      info[i, col] <- NA
    }
  }

  # FIXME remove
  if ("read_sub" %in% colnames(info)) {
    colnames(info)[colnames(info) == .SUB_TAG_COL]
    warning(paste0("Renamed deprecated field 'read_sub' to '", .SUB_TAG_COL, "'"))
  }

  
  col2type <- c("arrest_score" = as.numeric, "seq" = NULL)
  cols <- names(info)
  for (col in names(col2type)) {
    if (col %in% cols) {
      if (all(info[[col]] == "")) {
        info[col] <- NULL
      } else if (!is.null(col2type[[col]])) {
        i <- info[[col]] == ""
        if (length(i)) {
          info[i, col] <- NA
        }
        info[[col]] <- col2type[[col]](info[[col]])
      }
    }
  }
  # parse indels on demand
  info <- .unpack_indels(info, cond_count)

  info
}

# expand strings from vector s
.unpack <- function(s, new_cols) {
  df <- lapply(data.table::tstrsplit(s, split = ",", fixed = TRUE), as.numeric) %>% 
    as.data.frame() %>%
    tidyr::as_tibble()
  colnames(df) <- new_cols
  
  df
}

# expand cols
.unpack_cols <- function(df, prefixes, cond_count, new_cols) {
  matches = .matches(df, prefixes, cond_count)

  cols <- unique(matches$col)
  df <- .fill_empty(df, cols, new_cols)

  unpacked <- lapply(df[cols], .unpack, new_cols = new_cols)
  df <- dplyr::select(df, -dplyr::one_of(cols)) %>% tidyr::as_tibble()
  
  for (prefix in prefixes) {
    i <- matches$prefix == prefix
    if (any(i)) {
      merged <- .merge_cond(unpacked[matches[i, "col"]], matches[i, ])
      df[[prefix]] <- tidyr::as_tibble(merged)
    }
  }

  df
}

#
.merge_cond <- function(df, matches) {
  conds <- paste0("cond", matches$cond)
  unique_conds <- unique(conds)
  merged <- tapply(df, conds, tidyr::as_tibble, simplify = FALSE)
  
  for (cond in unique_conds) {
    cond_data <- merged[[cond]]
    i <- match(names(cond_data), matches$col)
    names(cond_data) <- paste0("rep", matches[i, "repl"])
    merged[[cond]] <- cond_data
  }
  names(merged) <- unique_conds
  merged <- c(merged)

  merged
}


# helper function to extract deletion info(s)
.unpack_indels <- function(info, cond_count) {
  info <- tibble::as_tibble(info)
  for (operation in c("insertion", "deletion")) {
    op <- substr(operation, 0, 3)
    score <- paste0(operation, "_score")
    pvalue <- paste0(operation, "_pvalue")
    
    for (col in c(score, pvalue)) {
      if (all(info[[col]] == "")) {
        info[[col]] <- NULL
      } else {
        info[[col]] <- as.numeric(info[[col]])
      }
    }

    matches = .matches(info, c(op), cond_count)
    cols <- unique(matches$col)
    if (length(cols) > 0) {
      new_cols <- c("reads", "coverage")
      df <- .fill_empty(info, cols, new_cols)
      unpacked <- lapply(df[cols], .unpack, new_cols = new_cols)
      for (col in cols) {
        info[[col]] <- NULL
      }
      
      i <- matches$prefix == op
      if (any(i)) {
        merged <- .merge_cond(unpacked[matches[i, "col"]], matches[i, ])
        info[[op]] <- tidyr::as_tibble(merged)
      }
    }
  }

  info
}
  
#' Read multiple related JACUSA2 results
#' 
#' Read multiple related JACUSA2 results.
#' 
#' @param files Vector of files.
#' @param meta_conds Vector of string vectors that explain files.
#' @param cond_descs Vector of strings that represent names/descriptions for conditions.
#' @param unpack Boolean indicates if info column should be processed.
#' @export
read_results <- function(files, meta_conds, cond_descs = replicate(length(files), c()), unpack = FALSE) {
  stopifnot(length(files) == length(meta_conds))
  
  # read all files  
  l <- mapply(function(file, meta_cond, cond_desc) {
    result <- read_result(file, cond_desc, unpack)
    
    type <- attr(result, .ATTR_TYPE)
    attr(result, .ATTR_TYPE) <- NULL
    attr(result, .ATTR_COND_DESC) <- NULL

    GenomicRanges::mcols(result)[[.META_COND_COL]] <- meta_cond
    return(list(result, type))
  }, files, meta_conds, cond_descs, SIMPLIFY = FALSE)

  types <- lapply(l, "[[", 2) %>% unlist(use.names = FALSE)

  # combine read files
  #results <- unlist(as(lapply(l, "[[", 1), "GRangesList"))
  results <- unlist(GenomicRanges::GRangesList(lapply(l, "[[", 1)))
  results$meta_cond <- as.factor(results$meta_cond)

  attr(results, .ATTR_TYPE) <- types
  attr(results, .ATTR_COND_DESC) <- cond_descs

  results
}