R/identifyTss.R

Defines functions identifyTSSs identifyTSSonChromAndStrand

Documented in identifyTSSonChromAndStrand identifyTSSs

#' Identify TSSs
#'
#' Clusters TSNs into TSSs separately for every chromosome / strand
#' combination found in `dat` and labels each row with the id of the TSS it
#' belongs to. Ids start at 1 and keep increasing from one chromosome / strand
#' combination to the next, so no id is shared between combinations.
#'
#' @param dat A data.table with the columns `seqnames` (chromosome), `strand`
#'   and `ID5`. The ID5 should correspond to bases labelled as TSNs. Rows are
#'   used in the order in which they appear in the table; this function does
#'   not sort them, so `ID5` is expected to already be ordered within each
#'   chromosome / strand combination.
#' @param thresh The distance over which TSNs are clustered into TSSs.
#'
#' @return `NULL`, invisibly. The data.table is modified in place: an integer
#'   column `tssId` holding the TSS id of each row is added to `dat`.
#' @export
#'
#' @examples
#' dat <- data.table::data.table(
#'   seqnames = c("chr1", "chr1", "chr1", "chr2"),
#'   strand = c("+", "+", "-", "+"),
#'   ID5 = c(10, 20, 15, 200)
#' )
#' identifyTSSs(dat, thresh = 60)
#' dat$tssId # 1 1 2 3
identifyTSSs <- function(dat, thresh = 60) {
  # Initiate the column of TSS ids
  dat[, tssId := 0L]
  currStartIdx <- 1L
  # Handle each chromosome and strand separately, as a TSS cannot span
  # different strands / chromosomes
  for (currChrom in as.character(unique(dat$seqnames))) {
    for (currStrand in as.character(unique(dat$strand))) {
      currID5 <- dat[seqnames == currChrom & strand == currStrand, ID5]
      # A chromosome need not have rows on every strand seen in the table.
      # Skip such empty combinations: otherwise the running start index
      # would become a zero-length vector and the next iteration would fail.
      if (length(currID5) == 0L) {
        next
      }
      currTssIds <- as.integer(
        identifyTSSonChromAndStrand(currID5, thresh, currStartIdx)
      )
      dat[seqnames == currChrom & strand == currStrand, tssId := currTssIds]
      # The next combination continues numbering after the last id used here
      currStartIdx <- currTssIds[length(currTssIds)] + 1L
    }
  }
  invisible(NULL)
}


#' Identify TSS ids for a given chromosome and strand
#'
#' Walks along the positions and opens a new TSS whenever the gap between two
#' consecutive positions is at least `thresh`. The TSSs are numbered
#' consecutively, beginning with `startIdx`.
#'
#' @param pos the ordered vector of ID5 ids
#' @param thresh the distance over which TSNs are clustered into TSSs; a gap
#'   of `thresh` or more between neighbouring positions separates two TSSs
#' @param startIdx the number used to start labelling the TSSs
#'
#' @export
#' @return a vector of TSS ids, one per element of `pos`
#'
#' @examples
#' pos <- c(5, 6, 6, 8, 19, 21, 21, 23, 37, 40, 51, 80, 83)
#' thresh <- 5
#' groups <- identifyTSSonChromAndStrand(pos, thresh, 1)
#' plot(seq_along(pos), pos, col = groups, pch = 19, xlab = "index", ylab = "position")
#' pos <- c(1, 20, 21, 22, 23, 50)
#' groups <- identifyTSSonChromAndStrand(pos, thresh, 1)
#' plot(seq_along(pos), pos, col = groups, pch = 19, xlab = "index", ylab = "position")
identifyTSSonChromAndStrand <- function(pos, thresh, startIdx) {
  nPos <- length(pos)
  # Indices of the elements that are followed by a gap of at least `thresh`,
  # i.e. the last element of every cluster except the final one
  breakAfter <- which(diff(pos) >= thresh)
  # First and last index of every cluster
  clusterStarts <- c(1, breakAfter + 1)
  clusterEnds <- c(breakAfter, nPos)
  clusterSizes <- clusterEnds - clusterStarts + 1
  # One id per cluster, repeated once for every member of that cluster
  lastId <- startIdx + length(clusterSizes) - 1
  rep(startIdx:lastId, times = clusterSizes)
}
Timothy-Barry/coproanalysis documentation built on Feb. 12, 2020, 7:33 a.m.