#' Read JACUSA2 result file
#'
#' \code{read_result()} reads data that was generated by JACUSA2 and creates a JACUSA2 result object.
#'
#' @param file String that represents the file name of the JACUSA2 output or a URL.
#' @param cond_desc Vector of strings that represent names/descriptions for conditions.
#' @param unpack Boolean or vector of fields to be unpacked.
#' @param tmpdir String; see `data.table::fread`.
#' @param showProgress Boolean; see `data.table::fread`.
#' @param ... Parameters forwarded to `data.table::fread`.
#'
#' @importFrom magrittr %>%
#'
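#' @examples
#' \dontrun{
#' # minimal sketch; "jacusa2.out" is a hypothetical JACUSA2 output file
#' result <- read_result("jacusa2.out", cond_desc = c("WT", "KO"), unpack = TRUE)
#' }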
#' @export
read_result <- function(file, cond_desc = c(), unpack = FALSE, tmpdir = tempdir(), showProgress = FALSE, ...) {
# if URL, download first
if (grepl("^(ftp|https?)://", file)) {
# partly adopted from data.table::fread
if (!requireNamespace("curl", quietly = TRUE))
stop("Input is a URL, which requires the 'curl' package that cannot be found. Please install 'curl' using install.packages('curl').")
tmpFile <- tempfile(fileext = paste0(".", tools::file_ext(file)), tmpdir = tmpdir)
curl::curl_download(file, tmpFile, mode = "wb", quiet = !showProgress)
file <- tmpFile
on.exit(unlink(file), add = TRUE)
}
# pre process file
# parse comments ^(#|##) to determine result/method type and number of conditions
fun <- if (grepl("\\.gz$", file)) base::gzfile else base::file
con <- fun(file, "r")
cond_count <- 0
if (length(cond_desc) > 0) {
cond_count <- length(cond_desc)
}
skip_lines <- 0
header_names <- NULL
jacusa_header <- c()
# possible result/method types: unknown, call-pileup, rt-arrest, lrt-arrest
type <- .UNKNOWN_METHOD
while (TRUE) {
line <- readLines(con, n = 1)
# stop reading: nothing left to read or first non-header line reached
if (length(line) == 0 || length(grep("^#", line)) == 0) {
break
}
# count header lines to ignore
skip_lines <- skip_lines + 1
if (length(grep("^#contig", line)) > 0) {
# try to guess result/method by header line
type <- .guess_file_type(line)
# type will be valid or .guess_file_type() throws an error
# parse and store header
# fix header: #contig -> contig
header_names <- sub("^#", "", line)
header_names <- unlist(base::strsplit(header_names, split = "\t", fixed = TRUE))
# guess number of conditions
guessed_cond_count <- .guess_cond_count(type, header_names)
if (cond_count > 0) {
if (guessed_cond_count != cond_count) {
stop("Length of description(",
cond_count,
") and conditions(", guessed_cond_count, ") don't match.")
}
}
cond_count <- guessed_cond_count
} else if (length(grep("^##", line)) > 0) {
jacusa_header <- c(gsub("^##", "", line), jacusa_header)
}
}
# finished pre-processing
close(con)
# check that a header could be parsed
if (is.null(header_names)) {
stop("No header line for file: ", file)
}
# check that conditions could be guessed
if (cond_count < 1) {
stop("Conditions could not be guessed for file: ", file)
}
# read data
data <- data.table::fread(
file,
skip = skip_lines,
sep = "\t",
header = FALSE,
...
)
colnames(data) <- header_names
# convert the 5th column (score/stat) to numeric, treating .EMPTY as NA
i <- data[[5]] == .EMPTY
if (any(i)) {
data[[5]][i] <- NA
data[[5]] <- as.numeric(data[[5]])
}
# create result data depending on determined method type
data <- .create_result(type, cond_count, data)
# unpack info field unless unpack is FALSE
if (!isFALSE(unpack)) {
if (is.logical(unpack) && unpack) {
prefixes <- .type2prefix(type)
prefix <- prefixes[1]
ins_keys <- c(paste0("ins", .extract_cond_repl(header_names, prefix)), "insertion_score", "insertion_pvalue")
del_keys <- c(paste0("del", .extract_cond_repl(header_names, prefix)), "deletion_score", "deletion_pvalue")
unpack_keys <- c(
ins_keys, del_keys,
"seq",
"arrest_score",
paste0("reset", c(1:cond_count, "P")),
paste0("backtrack", c(1:cond_count, "P"))
)
} else {
unpack_keys <- unpack
}
unpacked_info <- unpack_info(data$info, cond_count, unpack_keys)
if (!is.null(unpacked_info)) {
GenomicRanges::mcols(data) <- cbind(GenomicRanges::mcols(data), unpacked_info)
}
}
# TODO remove type stuff and move to RangedExperimentResult
attr(data, .ATTR_TYPE) <- type
if (length(cond_desc)) {
attr(data, .ATTR_COND_DESC) <- cond_desc
}
attr(data, .ATTR_HEADER) <- jacusa_header
data
}
# Create result for type and conditions from data
.create_result <- function(type, cond_count, data) {
cols <- NULL
if (type == .CALL_PILEUP) {
cols <- c(.CALL_PILEUP_COL)
} else if (type %in% c(.RT_ARREST, .LRT_ARREST)) {
cols <- c(.ARREST_COL, .THROUGH_COL)
} else {
stop("Unknown type: ", type)
}
data <- .unpack_cols(data, cols, cond_count, .BASES)
l <- list(
seqnames = data$contig,
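# start appears to be 0-based (BED-like) in JACUSA2 output; shift to 1-based for IRanges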
ranges = IRanges::IRanges(start = data$start + 1, end = data$end),
strand = GenomicRanges::strand(gsub("\\.", "*", data$strand))
)
gr <- do.call(GenomicRanges::GRanges, l)
GenomicRanges::mcols(gr) <- data[, setdiff(colnames(data), c("contig", "start", "end", "strand"))]
# process arrest and through columns
if (type %in% c(.RT_ARREST, .LRT_ARREST)) {
GenomicRanges::mcols(gr)[[.CALL_PILEUP_COL]] <- mapply_repl(
Reduce,
GenomicRanges::mcols(gr)[[.ARREST_COL]],
GenomicRanges::mcols(gr)[[.THROUGH_COL]],
MoreArgs = list("f" = "+")
)
gr <- add_arrest_rate(gr)
}
# add coverage
GenomicRanges::mcols(gr)[[.COV]] <- coverage(GenomicRanges::mcols(gr)[[.CALL_PILEUP_COL]])
gr
}
#' Merged coordinate information
#'
#' Merges contig:start-end:strand coordinates from a JACUSA2 result object.
#'
#' @param result Object created by \code{read_result()} or \code{read_results()}.
#' @return Vector of merged coordinate strings.
#'
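#' @examples
#' \dontrun{
#' # sketch: "result" is a JACUSA2 result object created by read_result()
#' head(id(result))
#' }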
#' @export
id <- function(result) {
paste0(
GenomicRanges::seqnames(result),
":",
GenomicRanges::start(result),
"-",
GenomicRanges::end(result),
":",
GenomicRanges::strand(result)
)
}
#' Observed base calls.
#'
#' Observed base calls.
#'
#' @param bases tibble of base call counts
#' @return vector of observed base calls
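#' @examples
#' # hypothetical base call counts for two sites
#' bases <- data.frame(A = c(10, 0), C = c(0, 5), G = c(2, 0), T = c(0, 0))
#' base_call(bases)
#' #> [1] "AG" "C"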
#' @export
base_call <- function(bases) {
apply(bases > 0, 1, function(x) {
paste0(names(which(x)), collapse = "")
}
)
}
# match columns from df that are prefixed with types
.matches <- function(df, prefixes, cond_count) {
# regex parts:
prefix_regex <- paste0("^(", paste0(prefixes, collapse = "|"), ")")
cond_regex <- paste0("([0-9]{", nchar(cond_count), "})")
repl_regex <- "([0-9]+)"
# dissect matching column names into:
# match, prefix, condition, replicate
m <- do.call(
rbind,
regmatches(names(df), regexec(paste0(prefix_regex, cond_regex, repl_regex), names(df)))
)
col_names <- c("col", "prefix", "cond", "repl")
if (is.null(m) || ncol(m) != length(col_names)) {
m <- matrix(character(), 0, length(col_names))
}
colnames(m) <- col_names
matches <- as.data.frame(m, stringsAsFactors = FALSE)
matches
}
# replace empty with actual values
.fill_empty <- function(df, cols, new_cols) {
new_value <- paste0(rep(0, length(new_cols)), collapse = ",")
for (col in cols) {
i <- df[[col]] == .EMPTY | df[[col]] == ""
if (any(i)) {
df[[col]][i] <- new_value
}
}
df
}
#' Get keys of info field.
#'
#' Get keys of info field.
#'
#' @param info Vector of strings.
#' @return Vector of available keys.
#'
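#' @examples
#' # sketch with hypothetical info strings in JACUSA2 key=value format
#' info <- c("seq=ACGT;arrest_score=0.5", "seq=TTTT")
#' get_info_keys(info)
#' #> [1] "seq" "arrest_score"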
#' @export
get_info_keys <- function(info) {
keys <- base::strsplit(gsub("(;?[^=;]+)=[^=;]+", "\\1", info), ";", fixed = TRUE) %>% unlist() %>% unique()
return(setdiff(keys, .EMPTY))
}
#' Unpack info field
#'
#' Unpacks info field.
#'
#' @param info Vector of strings.
#' @param cond_count integer Number of conditions.
#' @param keys vector of string of keys to extract from "info".
#' @return returns a JACUSA2 result object with "info" unpacked.
#'
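#' @examples
#' \dontrun{
#' # sketch: unpack selected keys from the info column of a result object
#' unpacked <- unpack_info(result$info, cond_count = 2, keys = c("seq", "arrest_score"))
#' }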
#' @export
unpack_info <- function(info, cond_count, keys=get_info_keys(info)) {
if (is.null(keys) || length(keys) == 0) {
return(NULL)
}
info <- fast_unpack_info(info, names = keys) %>% as.data.frame()
for (col in c(paste0("reset", c(1:cond_count, "P")), paste0("backtrack", c(1:cond_count, "P")))) {
if (col %in% colnames(info)) {
i <- info[, col] == ""
info[i, col] <- NA
}
}
# FIXME remove
if ("read_sub" %in% colnames(info)) {
colnames(info)[colnames(info) == "read_sub"] <- .SUB_TAG_COL
warning(paste0("Renamed deprecated field 'read_sub' to '", .SUB_TAG_COL, "'"))
}
col2type <- list("arrest_score" = as.numeric, "seq" = NULL)
cols <- names(info)
for (col in names(col2type)) {
if (col %in% cols) {
if (all(info[[col]] == "")) {
info[col] <- NULL
} else if (!is.null(col2type[[col]])) {
i <- info[[col]] == ""
if (any(i)) {
info[i, col] <- NA
}
info[[col]] <- col2type[[col]](info[[col]])
}
}
}
# parse indels on demand
info <- .unpack_indels(info, cond_count)
info
}
# split comma-separated strings in vector s into numeric columns named new_cols
.unpack <- function(s, new_cols) {
df <- lapply(data.table::tstrsplit(s, split = ",", fixed = TRUE), as.numeric) %>%
as.data.frame() %>%
tidyr::as_tibble()
colnames(df) <- new_cols
df
}
# expand cols
.unpack_cols <- function(df, prefixes, cond_count, new_cols) {
matches <- .matches(df, prefixes, cond_count)
cols <- unique(matches$col)
df <- .fill_empty(df, cols, new_cols)
unpacked <- lapply(df[cols], .unpack, new_cols = new_cols)
df <- dplyr::select(df, -dplyr::all_of(cols)) %>% tidyr::as_tibble()
for (prefix in prefixes) {
i <- matches$prefix == prefix
if (any(i)) {
merged <- .merge_cond(unpacked[matches[i, "col"]], matches[i, ])
df[[prefix]] <- tidyr::as_tibble(merged)
}
}
df
}
# group unpacked per-replicate tibbles by condition (cond*) and name the replicates (rep*)
.merge_cond <- function(df, matches) {
conds <- paste0("cond", matches$cond)
unique_conds <- unique(conds)
merged <- tapply(df, conds, tidyr::as_tibble, simplify = FALSE)
for (cond in unique_conds) {
cond_data <- merged[[cond]]
i <- match(names(cond_data), matches$col)
names(cond_data) <- paste0("rep", matches[i, "repl"])
merged[[cond]] <- cond_data
}
names(merged) <- unique_conds
merged
}
# helper function to unpack insertion/deletion info field(s)
.unpack_indels <- function(info, cond_count) {
info <- tibble::as_tibble(info)
for (operation in c("insertion", "deletion")) {
op <- substr(operation, 1, 3)
score <- paste0(operation, "_score")
pvalue <- paste0(operation, "_pvalue")
for (col in c(score, pvalue)) {
if (is.null(info[[col]])) next
if (all(info[[col]] == "")) {
info[[col]] <- NULL
} else {
info[[col]][info[[col]] == ""] <- NA
info[[col]] <- as.numeric(info[[col]])
}
}
matches <- .matches(info, c(op), cond_count)
cols <- unique(matches$col)
if (length(cols) > 0) {
new_cols <- c("reads", "coverage")
df <- .fill_empty(info, cols, new_cols)
unpacked <- lapply(df[cols], .unpack, new_cols = new_cols)
for (col in cols) {
info[[col]] <- NULL
}
i <- matches$prefix == op
if (any(i)) {
merged <- .merge_cond(unpacked[matches[i, "col"]], matches[i, ])
info[[op]] <- tidyr::as_tibble(merged)
}
}
}
info
}
#' Read multiple related JACUSA2 results
#'
#' Read multiple related JACUSA2 results.
#'
#' @param files Vector of files.
#' @param meta_conds Vector of strings that assign a meta-condition to each file.
#' @param cond_descs List of string vectors that represent names/descriptions for conditions, one per file.
#' @param unpack Boolean that indicates if the info column should be unpacked.
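#' @examples
#' \dontrun{
#' # sketch: combine two hypothetical JACUSA2 runs and label them by meta-condition
#' results <- read_results(
#'   c("treated.out", "control.out"),
#'   meta_conds = c("treated", "control")
#' )
#' }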
#' @export
read_results <- function(files, meta_conds, cond_descs = replicate(length(files), c()), unpack = FALSE) {
stopifnot(length(files) == length(meta_conds))
# read all files
l <- mapply(function(file, meta_cond, cond_desc) {
result <- read_result(file, cond_desc, unpack)
type <- attr(result, .ATTR_TYPE)
attr(result, .ATTR_TYPE) <- NULL
attr(result, .ATTR_COND_DESC) <- NULL
GenomicRanges::mcols(result)[[.META_COND_COL]] <- meta_cond
return(list(result, type))
}, files, meta_conds, cond_descs, SIMPLIFY = FALSE)
types <- lapply(l, "[[", 2) %>% unlist(use.names = FALSE)
# combine read files
results <- unlist(GenomicRanges::GRangesList(lapply(l, "[[", 1)))
results$meta_cond <- as.factor(results$meta_cond)
attr(results, .ATTR_TYPE) <- types
attr(results, .ATTR_COND_DESC) <- cond_descs
results
}