R/topologyOnTranscripts.R

Defines functions topologyOnTranscripts

Documented in topologyOnTranscripts

#' @title Extracting the topology on transcripts
#' 
#' @description A function to extract meta-transcript topologies of the input GRanges object
#' 
#' @param x A \link{GRanges} object for the genomic ranges to be annotated.
#' @param txdb A \link{TxDb} or \link{EnsDb} object for the transcript annotation.
#' @param region_weights A numeric vector of length 3 indicating the weights for 5'UTR, CDS, and 3'UTR; default is \code{c(1/3,1/3,1/3)}.
#' @param ambiguityMethod If \code{ambiguityMethod} is \code{"mean"} (default), \code{"sum"}, \code{"min"}, or \code{"max"},
#' then the mean, sum, minimum, and maximum values of the multiple region mappings will be returned in the output value.
#' @param ignore.strand When set to \code{TRUE}, the strand information is ignored in the overlap calculations.
#' 
#' @return A numeric vector with the same length as x. 
#' 
#' @details The meta-tx topology is calculated based on the weighted sum of the relative position of \code{x} on exonic 5'UTR, CDS, and 3'UTR,
#' that is, the topology \code{= sum(c(rel_pos_5UTR, rel_pos_CDS, rel_pos_3UTR)*region_weights)}. 
#' The \code{rel_pos} for x instances not mapped to the region is equal to 1 or 0. Specifically,
#' \code{rel_pos} is 1 if \code{x} overlaps at a downstream region,
#' \code{rel_pos} is 0 if \code{x} overlaps at an upstream region. The topology values for \code{x} that is not mapped to any one of the 5'UTR, CDS, and 3'UTR will be set to \code{NA}.
#' 
#' The relative positions on a give region are directly extracted by the function \code{extractRegionRelativePosition()}.
#' 
#' @examples 
#' ## Load the TxDb object
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' 
#' ## Define the query GRanges
#' set.seed(01)
#' 
#' query_gr <- GRanges(rep(c("chr1", "chr2"), c(5, 15)),
#'                 IRanges(c(sample(11874:12127, 5), sample(38814:41527, 15)), width=1),
#'                 strand=Rle(c("+", "-"), c(5, 15)))
#'                 
#' ## Extract the meta-tx topology values
#' topologyOnTranscripts(query_gr, txdb)
#' 
#' ## Visualize the logistic regression curve with B-splines on a m6A miCLIP classification data set
#' 
#' GSE63753_sysy <- readRDS(system.file("extdata", "GSE63753_sysy.rds", package = "WhistleR"))
#' 
#' GSE63753_sysy$topology <- topologyOnTranscripts(GSE63753_sysy, txdb)
#'
#' library(ggplot2)
#' 
#' ggplot(na.omit(as.data.frame(mcols(GSE63753_sysy))), aes(topology, target)) +
#'   geom_smooth(formula = y ~ splines::ns(x, 8), method = "glm", method.args = list(family = "binomial")) +
#'   geom_vline(xintercept = c(0.33, 0.66), linetype = 2) +
#'   geom_hline(yintercept = 0.5, linetype = 3) +
#'   geom_text(aes(x=x,y=y,label=text),
#'             data = data.frame(
#'                x = c(0.165, 0.495, 0.825),
#'                y = c(0.1, 0.1, 0.1),
#'                text = c("5'UTR","CDS", "3'UTR")
#'   )) +
#'   scale_x_continuous(breaks = c(0, 0.33, 0.66, 0.9)) + scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) +
#'   theme_classic() + labs(x = "meta-tx topology", y = "prob of m6A = 1", title = "LR fit with cubic splines")
#' @export
topologyOnTranscripts <- function(x, 
                                  txdb, 
                                  region_weights = c(1/3,1/3,1/3),
                                  ambiguityMethod = c("mean", "sum", "min", "max"),
                                  ignore.strand=FALSE){
  
  u5bytx <- EnsureUCSC(fiveUTRsByTranscript(txdb),x)
  topology <- extractRegionRelativePosition(x,
                                          u5bytx,
                                          ambiguityMethod=ambiguityMethod,
                                          nomapValue="NA",
                                          ignore.strand=ignore.strand)*region_weights[1]
  rm(u5bytx)
  cdsbytx <- EnsureUCSC(cdsBy(txdb, by = "tx"),x)
  cdsrps <- extractRegionRelativePosition(x,
                                        cdsbytx,
                                        ambiguityMethod=ambiguityMethod,
                                        nomapValue="NA",
                                        ignore.strand=ignore.strand)
  rm(cdsbytx)
  indx <- !is.na(cdsrps)
  topology[indx] <- cdsrps[indx]*region_weights[2] + region_weights[1]
  rm(indx,cdsrps)
  
  u3bytx <- EnsureUCSC(threeUTRsByTranscript(txdb),x)
  u3rps <- extractRegionRelativePosition(x,
                                       u3bytx,
                                       ambiguityMethod=ambiguityMethod,
                                       nomapValue="NA",
                                       ignore.strand=ignore.strand)
  rm(u3bytx)
  indx <- !is.na(u3rps)
  topology[indx] <- u3rps[indx]*region_weights[3] + region_weights[2] + region_weights[1]
  rm(indx,u3rps)
  
  return(topology)
}
ZW-xjtlu/WhistleR documentation built on March 13, 2021, 10:50 a.m.