R/addAnnotationToTSR.R
In TSRchitect: Promoter identification from large-scale TSS profiling data

#' @title \strong{addAnnotationToTSR}
#' @description \code{addAnnotationToTSR} associates an identified promoter
#' with a given gene, if found upstream and on the same strand within
#' a specified range.
#'
#' @param experimentName an object of class \emph{tssObject} with occupied
#' data slots \emph{@@tsrData} (and/or \emph{@@tsrDataMerged}).
#' The \emph{tssObject} must alrady have an annotation attached to the slot
#' \emph{@@annotation}, which is provided by either
#' \code{\link{importAnnotationExternal}} or
#' \code{\link{importAnnotationHub}}.
#' @param tsrSetType Specifies the type of TSR set to be processed.
#' Options are "replicates" or "merged".
#' @param tsrSet Number of the data set (of type \emph{tsrSetType}) to be
#' processed. (numeric)
#' @param upstreamDist the maximum distance (in bp) upstream of the selected
#' interval necessary to associate a TSR with a given annotation. (numeric)
#' @param downstreamDist the maximum distance (in bp) downstream of the start
#' of the selected interval to associate a TSR with a given annotation.
#' (numeric)
#' @param feature Specifies the feature to be used for annotation
#' (typically "gene" [default] or "mRNA" for GFF3 input); set to "all" if all
#' annotations from the input are to be used. (character)
#' @param featureColumnID Name of the column identifier in the
#' \linkS4class{GRanges} annotation object. This should be "ID" (default) for
#' GFF3 input or "name" for bed input. (character)
#' @param writeTable logical, specifying whether the output should be written
#' to a tab-delimited file. Defaults to TRUE.
#`
#' @return addAnnotationToTSR adds feature annotation to the (merged)
#' \emph{@@tsrData} data frame and returns the updated \emph{tssObject}.
#'
#' @import BiocGenerics
#' @importFrom GenomicRanges GRanges findOverlaps promoters
#' @importFrom IRanges IRanges
#' @importFrom utils write.table
#'
#' @examples
#' load(system.file("extdata", "tssObjectExample.RData",
#' package="TSRchitect"))
#' tssObjectExample <- addAnnotationToTSR(experimentName=tssObjectExample,
#' tsrSetType="merged", tsrSet=1, upstreamDist=1000, downstreamDist=200,
#' feature="transcript", featureColumnID="ID", writeTable=FALSE)
#' #if the object attached to @@annotation is a gff/gff3 file
#'
#' @note An example similar to the this one can be found
#' in the vignette (/inst/doc/TSRchitect.Rmd)
#'
#' @export
#' @rdname addAnnotationToTSR-methods


setGeneric("addAnnotationToTSR",
    function(experimentName, tsrSetType, tsrSet=1, upstreamDist,
             downstreamDist, feature, featureColumnID, writeTable=TRUE)
    standardGeneric("addAnnotationToTSR")
)

#' @rdname addAnnotationToTSR-methods

setMethod("addAnnotationToTSR",
          signature(experimentName="tssObject", "character", "numeric",
                    upstreamDist="numeric", downstreamDist="numeric",
                    feature="character", featureColumnID="character",
                    "logical"),
          function(experimentName, tsrSetType, tsrSet, upstreamDist=1000,
                   downstreamDist=200, feature="gene", featureColumnID="ID",
                   writeTable=TRUE) {

              message("... addAnnotationToTSR ...")
              if (tsrSetType=="replicates") {
                  if (tsrSet>length(experimentName@tsrData)) {
                      stop("The value selected for tsrSet exceeds the number ",
                           "of slots in tsrData.")
                  }
                  outfname <- paste("TSRset-", tsrSet, sep="")
                  outfname <- paste(outfname, "txt", sep=".")
                  message("\nThe TSR set for TSS dataset ", tsrSet,
                          " will be written to file ", outfname,
                          "\nin your working directory.")
                  tsr.df <- experimentName@tsrData[[tsrSet]]
                  tsr.df$start <- as.numeric(as.character(tsr.df$start))
                  tsr.df$end <- as.numeric(as.character(tsr.df$end))
              } else if (tsrSetType=="merged") {
                  if (length(experimentName@tsrDataMerged)<1) {
                      stop("The @tsrDataMerged slot is currently empty.\n",
                          "Please complete the merger before continuing.")
                  }
                  if (tsrSet>length(experimentName@tsrDataMerged)) {
                      stop("The value selected for tsrSet exceeds the ",
                           "number of slots in tsrDataMerged.")
                  }
                  if (tsrSet<length(experimentName@tssCountDataMerged)) {
                      outfname <- paste("TSRsetMerged-", tsrSet, sep="")
                      outfname <- paste(outfname, "txt", sep=".")
                      message("\nThe merged TSR set for TSS dataset ",
                              tsrSet, " will be written to file ", outfname,
                              "\nin your working directory.")
                  } else { # "combined" case
                      outfname <- "TSRsetCombined.txt"
                      message("\nThe combined TSR set derived from all samples",
                              " will be written to file ", outfname,
                              "\nin your working directory.")
                  }
                  tsr.df <- experimentName@tsrDataMerged[[tsrSet]]
                  tsr.df$start <- as.numeric(as.character(tsr.df$start))
                  tsr.df$end <- as.numeric(as.character(tsr.df$end))
              } else {
                  stop("Error: argument tsrSetType to addAnnotationToTSR()",
                       "should be either \"replicates\" or \"merged\".")
              }

#  ... loading the annotion GRange object and pulling out what we want:
              allAnnotation <- experimentName@annotation
              if (length(allAnnotation)<1) {
                  stop("No annotation has been loaded to the tssObject.",
                       "\n\nPlease import an annotation prior to using ",
                       "addAnnotationToTSR.")
              }
#  ... pulling out the selected feature from the annotation GRanges
#  object (if the required, GFF3-standard, "type" column exists):
              if ( feature == "all"  || is.na(match("type",
                       names(allAnnotation@elementMetadata@listData))) ) {
                  annot.gr <- allAnnotation
              } else {
                  annot.gr <- allAnnotation[allAnnotation$type==feature, ]
              }

              # ... creating a GRanges object from the data frame tsr.df:
              tsr.gr <- makeGRangesFromDataFrame(tsr.df,
                                                 keep.extra.columns=FALSE,
                                                 ignore.strand=FALSE,
                                                 seqnames.field="seq",
                                                 start.field="start",
                                                 end.field="end",
                                                 strand.field="strand"
                                                 )

#  ... defining the regions of interest for annotation.
#  Typically this would be the predicted promoter regions based on
#  the gene annotion.
#  promoters() takes the 5'-start at x of annot.gr, then creates an interval
#  x-upstreamDist,x+downstreamDist-1 on the plus strand
#  or x-downstreamDist+1,x+upstreamDist on the minus strand; in short
#-  a potential promoter region.
              regionOfInterest <- promoters(annot.gr, upstream=upstreamDist,
                                            downstream=downstreamDist)
              ID.vec <- S4Vectors::mcols(regionOfInterest)[featureColumnID]
              ID.vec <- ID.vec[featureColumnID]
              ID.vec <- as.character(unlist(ID.vec))
              overlapHitList <- findOverlaps(tsr.gr, regionOfInterest)
#  ... overlapHitList is a hit list that indicates the overlaps
#  between tsr.gr entries and regionOfInterest entries:
              overlap.df <- as.data.frame(overlapHitList)

#...  adding featureID to tsr.df:
#
              tsr.df$featureID <- NA #seeding the data frame (with NAs,
#  which will represent no overlap after the next line of code is complete

#  now replacing the NAs in the queryHits locations with the featureColumnID
#  entries of corresponding subject = regionOfInterest entries:

              tsr.df$featureID[overlap.df$queryHits] <-
              ID.vec[overlap.df$subjectHits]
              rownames(tsr.df) <- paste(tsr.df$seq, tsr.df$start, tsr.df$end,
                                        tsr.df$strand, sep=".")
#writing the table to a file if writeTable=TRUE 
              if (writeTable==TRUE) {
                  write.table(format(tsr.df,scientific=FALSE), file=outfname,
                              col.names=TRUE, row.names=FALSE, sep="\t",
                              quote=FALSE)
                  message("\nThe updated TSR data have been written to ",
                          "file ", outfname, " in your working directory.")
              }
              #Update the record:
              if (tsrSetType=="replicates") {
                  experimentName@tsrData[[tsrSet]] <- tsr.df
              } else {
                  experimentName@tsrDataMerged[[tsrSet]] <- tsr.df
              }

              message("Done. GeneIDs have been associated with adjacent",
                      " TSRs.\n")
              message("-----------------------------------------------------\n")
              message(" Done.\n")
              return( experimentName)
          }
          )