R/05-makeFinalTable.R

# vRNA fragment processing in order to determine which RNA species
# are "full length" RNAs (vRNAs) and which are the result of internal
# deletions (DIs).  A tibble with metadata is then returned.

#' Produce a final output table of DIs
#'
#' @param vRNATable A tibble produced by the makevRNATable function
#' @param sequences An RNAStringSetList produced by the vRNASeqs function
#' @param outType A character variable to select the desired output from the function
#' @param sequenceLengths A named integer vector, usually derived from seqlengths(BSGenomeObj)
#' @return A tibble containing only candidate DIs
#' @import dplyr
#' @importFrom rlang .data
#' @export
#'
makeDItable <- function(vRNATable = NULL,
                        sequences = NULL,
                        outType = "All",
                        sequenceLengths = NULL) {

# Do some checking on user parameters to make sure they are sane
  if (!checkmate::test_tibble(vRNATable) ||
      base::nrow(vRNATable) < 1) {
    stop(
      "Check that the input vRNATable is correctly formatted.
      The input table should be a tibble that has at least one row."
    )
  } else {
    if (!checkmate::test_class(sequences, "RNAStringSetList") ||
        length(sequences) < 1)
    {
      stop(
        "Check that the input sequence object is correctly formatted.
        The input table should be a RNAStringSetList that has at least one row."
      )
    }
  }
  if (!checkmate::test_choice(outType, c("DI", "Full", "All"))) {
      stop(
        "You must specify an output type.  The options are DI, Full or All."
      )
  }

  (sequences <- unlist(sequences))
  seqchar <- vector("character", length(sequences))
  seqchar <- as.character(sequences)

  if (!identical(names(seqchar), vRNATable$filepath_and_index)) {
    stop("Sequence object names DO NOT MATCH those found in
         vRNATable.  Something is wrong!")
  }

  # Insert sequences into the table BEFORE splitting.
  vRNATable$sequences <- unname(seqchar)
  fullLength <- vRNATable[vRNATable$RNA_type == "vRNA",]
  fragments <- vRNATable[vRNATable$RNA_type == "Frag",]
  # List of indices in he fullLength table that have components in the fragments tabel
  seqv <- seq(from = 1, to = nrow(fullLength))
  seqIndices <- purrr::map(seqv, .makeSeqIndices, fl = fullLength, fr = fragments)
  # Retain only needed columns before extraction of fragment info
  fragments <- dplyr::select(fragments, .data$start, .data$end, .data$width, .data$sequences)
  fragments <-
    dplyr::rename(
      fragments,
      Start = .data$start,
      End = .data$end,
      Sequences = .data$sequences,
      Width = .data$width
    )
  # Text to append to "Fragment_x" columns which will be added to final table
  columnsToAdd <- c("_Start", "_End", "_Length", "_Sequence")
  # Store column types in name attribute
  names(columnsToAdd) <-
    c("numeric", "numeric", "numeric", "character")
  columnSeq <- purrr::map(seqIndices, seq_along)
  if (max(unlist(columnSeq)) > 2) {
    cat(
      "***Warning, DIs with greater than 2 component fragments detected.
      Although this is not necessarily an incorrect result, it is HIGHLY unusual.
      Proceed with caution.***"
    )
  }
  # Need to call closures that are in a list
  callFunc <- function(xx, ...) xx(...)

  maxColumnSeq <-
    seq(from = 1, to = max(purrr::map_int(seqIndices, length)))
  functionList <- purrr::map(maxColumnSeq, .createColumns, textv = columnsToAdd)

  columnNames <- purrr::map(functionList, callFunc)
  # Here we can create lists of vectors that contain row indices from the total fragment table
  # This approach allows for the possibility of RNAs with more than two component fragments
  # and is efficient.
  fragIndices <- purrr::map(maxColumnSeq, .fill, sI = seqIndices)
  fillerFunc <- function(x) {
    fragList <- vector("list", length = 1)
    fragList[[x]] <- fragments[fragIndices[[x]], ]
  }
  # Now we just pull the fragments and place them in one of the tibbles in our list
  fragmentInfo <- purrr::map(maxColumnSeq, fillerFunc)
  fragmentInfoCombined <- dplyr::bind_cols(fragmentInfo)
  names(fragmentInfoCombined) <- unlist(columnNames)
  # Now we just bind everything together, and then add a "Delimited_Sequence"
  # column so that we can add a separator between the individual component fragment
  # sequences.
  diTableCombined <- dplyr::bind_cols(fullLength, fragmentInfoCombined)

  diTableCombined <- dplyr::mutate(
    diTableCombined,
    Complete_Sequence = purrr::map_chr(seqIndices,
                                       ~ paste(fragments$Sequences[.x], collapse = "")),
    Delimited_Sequence = purrr::map_chr(seqIndices,
                                        ~ paste(fragments$Sequences[.x], collapse = "/"))
  ) %>%
    dplyr::mutate(Complete_Sequence_Length = purrr::map_dbl(.data$Complete_Sequence, nchar))
  # Rename some columns for clarity
  diTableCombined <-
    dplyr::rename(
      diTableCombined,
      Ref_Width = .data$width,
      Ref_Start = .data$start,
      Ref_End = .data$end,
      Origin_Strand = .data$strand
    )
  # Determine the Subtype of each RNA using .classifyRNA
  diTableCombined <-
    dplyr::mutate(
      diTableCombined,
      RNA_subType = purrr::map_chr(
        seq(from = 1, to = nrow(diTableCombined)),
        ~ .classifyRNA(diTableCombined[.x,], sequenceLengths)

      )
    )
  # Now we modify the RNA type based on the presence of a second fragment
  diTableCombined$RNA_type <- dplyr::case_when(
    !is.na(diTableCombined$Fragment_2_Length) ~ "DI Candidate",
    is.na(diTableCombined$Fragment_2_Length) ~ "Full Length vRNA Candidate"
  )
  # Final polishing of table to remove any undeeded columns before returning results.
  diTableFinal <- diTableCombined %>%
    select(-sequences)
  # Now tailor the output according to the "outType" selection.
   if (outType == "DI") {
     diTableCombined <- dplyr::filter(diTableCombined, .data$RNA_type == "DI Candidate")
     cat("Returning only candidate DI RNAs")
     return(diTableCombined)
   } else if (outType == "Full") {
     diTableCombined <- dplyr::filter(diTableCombined, .data$RNA_type == "Full Length vRNA Candidate")
     cat("Returning only candidate full length RNAs")
     return(diTableCombined)
   } else if (outType == "All") {
     cat("Returning both DI and full length candidate RNAs")
     return(diTableCombined)
   }

  }
mdurbanowski/digR documentation built on June 10, 2019, 1:27 a.m.