# vRNA fragment processing in order to determine which RNA species
# are "full length" RNAs (vRNAs) and which are the result of internal
# deletions (DIs). A tibble with metadata is then returned.
#' Produce a final output table of DIs
#'
#' @param vRNATable A tibble produced by the makevRNATable function
#' @param sequences An RNAStringSetList produced by the vRNASeqs function
#' @param outType A character variable to select the desired output from the function
#' @param sequenceLengths A named integer vector, usually derived from seqlengths(BSGenomeObj)
#' @return A tibble containing only candidate DIs
#' @import dplyr
#' @importFrom rlang .data
#' @export
#'
makeDItable <- function(vRNATable = NULL,
sequences = NULL,
outType = "All",
sequenceLengths = NULL) {
# Do some checking on user parameters to make sure they are sane
if (!checkmate::test_tibble(vRNATable) ||
base::nrow(vRNATable) < 1) {
stop(
"Check that the input vRNATable is correctly formatted.
The input table should be a tibble that has at least one row."
)
} else {
if (!checkmate::test_class(sequences, "RNAStringSetList") ||
length(sequences) < 1)
{
stop(
"Check that the input sequence object is correctly formatted.
The input table should be a RNAStringSetList that has at least one row."
)
}
}
if (!checkmate::test_choice(outType, c("DI", "Full", "All"))) {
stop(
"You must specify an output type. The options are DI, Full or All."
)
}
(sequences <- unlist(sequences))
seqchar <- vector("character", length(sequences))
seqchar <- as.character(sequences)
if (!identical(names(seqchar), vRNATable$filepath_and_index)) {
stop("Sequence object names DO NOT MATCH those found in
vRNATable. Something is wrong!")
}
# Insert sequences into the table BEFORE splitting.
vRNATable$sequences <- unname(seqchar)
fullLength <- vRNATable[vRNATable$RNA_type == "vRNA",]
fragments <- vRNATable[vRNATable$RNA_type == "Frag",]
# List of indices in he fullLength table that have components in the fragments tabel
seqv <- seq(from = 1, to = nrow(fullLength))
seqIndices <- purrr::map(seqv, .makeSeqIndices, fl = fullLength, fr = fragments)
# Retain only needed columns before extraction of fragment info
fragments <- dplyr::select(fragments, .data$start, .data$end, .data$width, .data$sequences)
fragments <-
dplyr::rename(
fragments,
Start = .data$start,
End = .data$end,
Sequences = .data$sequences,
Width = .data$width
)
# Text to append to "Fragment_x" columns which will be added to final table
columnsToAdd <- c("_Start", "_End", "_Length", "_Sequence")
# Store column types in name attribute
names(columnsToAdd) <-
c("numeric", "numeric", "numeric", "character")
columnSeq <- purrr::map(seqIndices, seq_along)
if (max(unlist(columnSeq)) > 2) {
cat(
"***Warning, DIs with greater than 2 component fragments detected.
Although this is not necessarily an incorrect result, it is HIGHLY unusual.
Proceed with caution.***"
)
}
# Need to call closures that are in a list
callFunc <- function(xx, ...) xx(...)
maxColumnSeq <-
seq(from = 1, to = max(purrr::map_int(seqIndices, length)))
functionList <- purrr::map(maxColumnSeq, .createColumns, textv = columnsToAdd)
columnNames <- purrr::map(functionList, callFunc)
# Here we can create lists of vectors that contain row indices from the total fragment table
# This approach allows for the possibility of RNAs with more than two component fragments
# and is efficient.
fragIndices <- purrr::map(maxColumnSeq, .fill, sI = seqIndices)
fillerFunc <- function(x) {
fragList <- vector("list", length = 1)
fragList[[x]] <- fragments[fragIndices[[x]], ]
}
# Now we just pull the fragments and place them in one of the tibbles in our list
fragmentInfo <- purrr::map(maxColumnSeq, fillerFunc)
fragmentInfoCombined <- dplyr::bind_cols(fragmentInfo)
names(fragmentInfoCombined) <- unlist(columnNames)
# Now we just bind everything together, and then add a "Delimited_Sequence"
# column so that we can add a separator between the individual component fragment
# sequences.
diTableCombined <- dplyr::bind_cols(fullLength, fragmentInfoCombined)
diTableCombined <- dplyr::mutate(
diTableCombined,
Complete_Sequence = purrr::map_chr(seqIndices,
~ paste(fragments$Sequences[.x], collapse = "")),
Delimited_Sequence = purrr::map_chr(seqIndices,
~ paste(fragments$Sequences[.x], collapse = "/"))
) %>%
dplyr::mutate(Complete_Sequence_Length = purrr::map_dbl(.data$Complete_Sequence, nchar))
# Rename some columns for clarity
diTableCombined <-
dplyr::rename(
diTableCombined,
Ref_Width = .data$width,
Ref_Start = .data$start,
Ref_End = .data$end,
Origin_Strand = .data$strand
)
# Determine the Subtype of each RNA using .classifyRNA
diTableCombined <-
dplyr::mutate(
diTableCombined,
RNA_subType = purrr::map_chr(
seq(from = 1, to = nrow(diTableCombined)),
~ .classifyRNA(diTableCombined[.x,], sequenceLengths)
)
)
# Now we modify the RNA type based on the presence of a second fragment
diTableCombined$RNA_type <- dplyr::case_when(
!is.na(diTableCombined$Fragment_2_Length) ~ "DI Candidate",
is.na(diTableCombined$Fragment_2_Length) ~ "Full Length vRNA Candidate"
)
# Final polishing of table to remove any undeeded columns before returning results.
diTableFinal <- diTableCombined %>%
select(-sequences)
# Now tailor the output according to the "outType" selection.
if (outType == "DI") {
diTableCombined <- dplyr::filter(diTableCombined, .data$RNA_type == "DI Candidate")
cat("Returning only candidate DI RNAs")
return(diTableCombined)
} else if (outType == "Full") {
diTableCombined <- dplyr::filter(diTableCombined, .data$RNA_type == "Full Length vRNA Candidate")
cat("Returning only candidate full length RNAs")
return(diTableCombined)
} else if (outType == "All") {
cat("Returning both DI and full length candidate RNAs")
return(diTableCombined)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.