coproanalysis: Analyze CoPRO and Related Data

Documented in identifyTSNs identifyTSNsR

#' Identify TSNs
#'
#' NOTE: This function is currently inactive and invisible to users of the pacakge. We may re-instate this function later, but for now, use identifyTSNsR to identify TSNs.
#'
#' This function takes the results of a CoPRO experiment, stored in a data.table, and returns the set of TSNs present in the data. We use the following criteria to determine which nucleotides are TSNs:
#' 1. 3+ pause positions and 1+ total capped reads
#' 2. Reads > 55 nt are > 25% capped
#' 3. >= 15 % of reads that start within 300 bp upstream or downstream are capped (to be implemented)
#'
#' @param dat a data.table containing results of a CoPRO experiment
#' @param nDistinctPause number of distinct pause sites required
#' @param nCap number of capped reads required
#' @param widthTresh the transcript length after which pLongCappedTresh of reads must be capped
#' @param pLongCappedTresh the fraction of reads greater than widthTresh in length that must be capped
#' @param factorDownsizeDataForPlot the factor by which the size of the data is (randomly) reduced in generating the plot
#' @param plotMe logical value indicating whether plots should be generated
#'
#' @return a vector containing the nucleotides (indexed by ID5) that qualify as TSNs according to the above criteria.

identifyTSNs <- function(dat, nDistinctPause = 3, nCap = 1, widthTresh = 55, pLongCappedTresh = 0.25, factorDownsizeDataForPlot = 4, plotMe = TRUE) {
  # Color pallete
  heatcols <- c('white', "#A7A7A7", "dodgerblue", "forestgreen","gold", "firebrick")

  # 1. We find the number of distinct pause positions and capped reads for each ID5
  cond1 <- dat[, list(nPause = .N, nCappedReads = sum(C)), by = ID5]
  # We plot a representative random sample of the rows of cond1
  if (plotMe) plot(ggplot(data = cond1[sample(1:nrow(cond1), nrow(cond1)/factorDownsizeDataForPlot)][nPause <= 40], aes(x = nPause, y = log2(nCappedReads + 1))) + geom_hex() + scale_y_continuous(name = "log(N capped reads + 1)") + xlab("N distinct pause sites") + scale_fill_gradientn(name = "", colors = heatcols, trans='log10') + geom_vline(xintercept = nDistinctPause, col = "red") +  geom_hline(yintercept = log(nCap + 1), col = "red"))
  # Finally, store the ID5s that satisfy condition 1
  ID5C1 <- cond1[nPause >= nDistinctPause & nCappedReads >= nCap, ID5]
  rm(cond1); invisible(gc())

  # 2. For each ID5 with width exceeding 55, we compute the fraction of reads >= 55nt that are capped.
  cond2 <- dat[width >= widthTresh][, list(pLongCapped = sum(C) / sum(tot)), by = ID5]
  # We make a density plot of fraction of transcripts > 55 nt that are capped
  if (plotMe) plot(ggplot(data = cond2[sample(1:nrow(cond2), nrow(cond2)/factorDownsizeDataForPlot)], mapping = aes(x = pLongCapped)) + geom_density(fill = "lightblue") + geom_vline(xintercept = pLongCappedTresh, col = "red"))
  # We isolate those tsns (by ID5) that satisfy condition 2: Either the maximum width < 55 nt, or > 25% of those reads >= 55 nt are capped
  ID5C2.1 <- dat[, list(maxWidth = max(width)), by = ID5][maxWidth <= widthTresh, ID5]
  ID5C2.2 <- cond2[pLongCapped > pLongCappedTresh, ID5]
  rm(cond2); invisible(gc())
  ID5C2 <- union.Vector(ID5C2.1, ID5C2.2)
  rm(ID5C2.1, ID5C2.2); invisible(gc())
  ######################
  # 3. To be implemented
  ######################
  out <- intersect.Vector(ID5C1, ID5C2)
  return(out)
}

#' Identify TSNs using the R column
#'
#' This function is similar to the identifyTSNs function. The difference is that the capped + uncapped library (the R column) is used to define TSNs. We have the following requirements.
#' 1. 3 + pause positions.
#' 2. 5 + total reads.
#' @param dat a data.table containing results of a CoPRO experiment
#' @param nDistinctPauseThresh number of distinct pause sites required
#' @param nReadsThresh number of total cappped + uncapped (R) reads required
#'
#' @return a vector containing the nucleotides (indexed by ID5) that qualify as TSNs according to the above criteria.
#' @export

identifyTSNsR <- function(dat, nDistinctPauseThresh = 3, nReadsThresh = 5, plotMe = FALSE) {
  # We find the number of distinct pause positions and capped reads for each ID5
  cond1 <- dat[R > 0, list(nDistinctPause = .N, nReads = sum(R)), by = ID5]
  # We plot a representative random sample of the rows of cond1
  if (plotMe) {
    heatcols <- c('white', "#A7A7A7", "dodgerblue", "forestgreen","gold", "firebrick")
    ggplot(data = cond1[nDistinctPause <= 40], aes(x = nDistinctPause, y = log(nReads))) + geom_hex() + scale_y_continuous(name = "log(reads)") + xlab("N distinct pause sites") + scale_fill_gradientn(name = "", colors = heatcols, trans='log10') + geom_vline(xintercept = nDistinctPauseThresh, col = "red") +  geom_hline(yintercept = log(nReadsThresh), col = "red")
  }
  # Finally, store the ID5s that satisfy condition 1
  ID5C <- cond1[nDistinctPause >= nDistinctPauseThresh & nReads >= nReadsThresh, ID5]
  return(ID5C)
}