MethylIT.utils: MethylIT utility

Documented in dmpDensity

#' @rdname dmpDensity
#' @title Linear density of DMPs at a given genomic region
#' @description The linear density of DMPs in a given genomic region (GR) is
#'     defined according with the classical terminology in physics, i.e., as the
#'     measure of the physical quantity of any characteristic value per unit of
#'     length. In the current case, as the amount of DIMPs per nucleotide base.
#'
#' @details Since the number of DIMPs along the DNA sequence vary, the local
#'     density of DMPs \eqn{\rho_i} at a fixed interval \eqn{\Delta} l_i is
#'     defined by the quotient \eqn{\rho_i = \Delta DMP_i/\Delta l_i} is the
#'     amount of DIMPs at the fixed interval. Likewise the local density of
#'     non-DIMPs is defined as \eqn{\rho_i = \Delta nonDMP_i/\Delta l_i}.
#'     Notice that for a specified methylation context, e.g., CG,
#'     \eqn{\Delta CG_i - \Delta DMP_i}, where \eqn{\Delta CG} is the
#'     amount CG positions at the given interval. The linear densities are
#'     normalized as \eqn{\rho_i/\rho_max}, where \eqn{\rho_max} is the maximum
#'     of linear density found in a given GR.
#' @param GR A genomic GRanges object carrying the genomic region where the
#'     estimation of the DMP density will be accomplished.
#' @param cut.col Integer denoting the GR metacolumn where the decision variable
#'     about whether a position is DMP is located. Default cut.col = 1.
#' @param cutoff Cut value to decide wheter the value of the variable used to
#'     estimate the density is a DMP at each position. If missing, then
#'     cutoff is estimated as the first queantile greater than zero from the
#'     values given in the GR column \emph{cut.col}.
#' @param Chr A character string. Default NULL. If the GR object comprises
#'     several chromosomes, then one chromosome must be specified. Otherwise the
#'     density of first chromosome will be returned.
#' @param start.pos,end.pos Start and end positions, respectively, of the GR
#'     where the density of DMPs will be estimated. Default NULL. If NULL
#'     densities will be estimated for the whole GR and the specified
#'     chromosome.
#' @param int.size1,int.size2 The interval/window size where the density of DMP
#'     and no DMPs are computed. Default Null.
#' @param breaks Integer. Number of windows/intervals to split the GR. Deafult
#'     NULL. If provided, then it is applied to compute the densities of DMPs
#'     and no-DMPs. If 'int.size1', 'int.size2', and 'breaks' are NULL, then the
#'     breaks are computed as:
#'     \code{breaks <- min(150, max(start(x))/nclass.FD(start(x)),
#'     na.rm = TRUE)},
#'     where function \emph{nclass.FD} (\code{\link[grDevices]{nclass}}) applies
#'     Freedman-Diaconis algorithm.
#' @param scaling Logic value to deside whether to perform the scaling of the
#'     estimated density values or not. Default is TRUE.
#' @param plot Logic. Whether to produce a grahic or not. Default, plot = TRUE.
#' @param noDMP.dens Logic whether to produce the graphics for no-DMP density.
#'     Default is TRUE
#' @param xlabel X-axis label. Default \emph{xlabel = 'Coordinate'}.
#' @param ylabel Y-axis label. Default \emph{ylabel = 'Normalized density'}.
#' @param col.dmp Color for the density of DMPs in the graphic.
#' @param col.ndmp Color for the density of no DMPs in the graphic.
#' @param yintercept If plot == TRUE, this is the position for an horizantal
#'     line that intercept the y-axis. Default yintercept = 0.25.
#' @param col.yintercept Color for the horizantal line 'yintercept'. Default
#'     \emph{col.yintercept = 'blue'}
#' @param type.yintercept Line type for the horizantal line 'yintercept'.
#'     Default \emph{type.yintercept = 'dashed'}.
#' @param dig.lab integer which is used when labels are not given. It determines
#'     the number of digits used in formatting the break numbers.
#' @examples
#' set.seed(349)
#' ## An auxiliary function to generate simulated hypothetical values from a
#' ## variable with normal distribution
#' hypDT <- function(mean, sd, n, num.pos, noise) {
#'     h <- hist(rnorm(n, mean = mean, sd = sd), breaks = num.pos, plot = FALSE)
#'     hyp <- h$density * 60 + runif(length(h$density)) * noise
#'     return(hyp)
#' }
#'
#' ## To generate a matrix of values with variations introduced by noise
#' hyp <- hypDT(mean = 5, sd = 30, n = 10^5, noise = 4, num.pos = 8000)
#' ## A GRanges object is built, which will carries the previous matrix on its
#' ## meta-columns
#' l <- length(hyp)
#' starts <- seq(0, 30000, 3)[1:l]
#' ends <- starts
#' GR <- GRanges(seqnames = 'chr1', ranges = IRanges(start = starts,
#'                 end = ends))
#' mcols(GR) <- data.frame(signal = hyp)
#'
#' # If plot is TRUE a grphic is printed. Otherwise data frame is returned.
#' p <- dmpDensity(GR, plot = FALSE)
#'
#' # If ggplot2 package is installed, then graphic can customized using
#' # the returned data frame 'p':
#'
#' # library(ggplot2)
#' ## Auxiliar function to write scientific notation in the graphics
#' # fancy_scientific <- function(l) {
#' #   #'turn in to character string in scientific notation
#' #   l <- format( l, scientific = TRUE, digits = 1 )
#' #   l <- gsub('0e\\+00','0',l)
#' #   #'quote the part before the exponent to keep all the digits
#' #   l <- gsub('^(.*)e', ''\\1'e', l)
#' #   #'turn the 'e+' into plotmath format
#' #   l <- gsub('e', '%*%10^', l)
#' #   l <- gsub('[+]', '', l )
#' #   #'return this as an expression
#' #   parse(text=l)
#' # }
#' #
#' # max.pos = max(p$DMP.coordinate)
#' # ggplot(data=p) +
#' #   geom_line(aes(x=DMP.coordinate, y=DMP.density), color='red') +
#' #   geom_hline(aes(yintercept=0.25), linetype='dashed',
#' #              colour='blue', show.legend=FALSE ) +
#' #   geom_line(aes(x=coordinate, y=density), color='blue') +
#' #   xlab('Coordinate') + ylab('Normalized density') +
#' #   scale_y_continuous(breaks=c(0.00, 0.25, 0.50, 0.75, 1.00)) +
#' #   scale_x_continuous(breaks=c(0.00, 0.25 *max.pos, 0.50*max.pos,
#' #                               0.75*max.pos, max.pos),
#' #                      labels = fancy_scientific) +
#' #   expand_limits(y=0)
#' @importFrom GenomicRanges setdiff
#' @importFrom grDevices nclass.FD
#' @return If plot is TRUE will return a graphic with the densities of DMPs and
#'     and no DMPs. If plot is FALSE a data frame object with the density of
#'     DMPs and not DMPs will be returned.
#' @author Robersy Sanchez
#' @export

dmpDensity <- function(GR, column = 1, cut.col = 1, cutoff, Chr = NULL, 
    start.pos = NULL, end.pos = NULL, int.size1 = NULL, int.size2 = NULL, 
    breaks = NULL, scaling = TRUE, plot = FALSE, noDMP.dens = TRUE, 
    xlabel = "Coordinate", ylabel = "Normalized density", col.dmp = "red", 
    col.ndmp = "blue", yintercept = 0.25, col.yintercept = "magenta", 
    type.yintercept = "dashed", dig.lab = 3) {
    if (class(GR) != "GRanges") 
        stop("GR must be a 'GRanges' object")
    
    # Auxiliar function to write scientific notation in the graphics
    fancy_scientific <- function(l) {
        # turn in to character string in scientific notation
        l <- format(l, scientific = TRUE, digits = 1)
        l <- gsub("0e\\+00", "0", l)
        # quote the part before the exponent to keep all the digits
        l <- gsub("^(.*)e", "'\\1'e", l)
        # turn the 'e+' into plotmath format
        l <- gsub("e", "%*%10^", l)
        l <- gsub("[+]", "", l)
        # return this as an expression
        parse(text = l)
    }
    # Function to compute the density of DMPs
    points <- function(x, int.length = NULL, scale, dig.lab, breaks = NULL) {
        intv <- start(x)
        # Split the gene into fixex interval
        if (is.null(int.length) & is.null(breaks)) {
            breaks <- min(150, max(start(x))/nclass.FD(start(x)), 
                na.rm = TRUE)
        }
        if (!is.null(int.length)) 
            breaks <- round(length(x)/int.length)
        dmp <- table(cut(intv, breaks = breaks, dig.lab = dig.lab))
        # Middle interval position
        pos <- rowMeans(t(unname(sapply(gsub("[(]*[]]*", "", names(dmp)), 
            function(s) as.numeric(strsplit(s, split = ",")[[1]])))))
        dmp <- as.numeric(dmp)
        if (scaling) 
            dmp <- dmp/max(dmp, na.rm = TRUE)
        return(data.frame(pos, dmp))
    }
    
    if (!is.null(Chr)) 
        GR <- GR[seqnames(GR) == Chr, ] else {
        chr <- unique(seqnames(GR))
        if (length(chr) > 1) 
            GR <- GR[seqnames(GR) == chr[1], ]
    }
    
    if (!is.null(start.pos) && !is.null(end.pos)) {
        starts <- start(GR)
        if (end.pos > max(starts)) 
            stop("'end.pos' is greater than the higest coordinate in the GR")
        if (end.pos > max(starts)) 
            stop("'start.pos' is greater than the lowest coordinate in the GR")
        idx1 <- min(which(starts >= start.pos))
        idx2 <- max(which(start(GR) <= end.pos))
        GR <- GR[idx1:idx2]
    }
    
    if (missing(cutoff)) {
        qs <- summary(mcols(GR)[, 1])[2:6]
        cutoff <- qs[qs > 0][1]
    }
    
    idx <- (mcols(GR)[, cut.col] >= cutoff)
    x1 <- GR[idx, ]
    x2 <- GenomicRanges::setdiff(GR, x1, ignore.strand = TRUE)
    x1 <- points(x = x1, int.length = int.size1, scale = scale, dig.lab = dig.lab, 
        breaks = breaks)
    x2 <- points(x = x2, int.length = int.size2, scale = scale, dig.lab = dig.lab, 
        breaks = breaks)
    if (plot) {
        max.pos <- max(x1$pos)
        tick.pos <- c(0, 0.25 * max.pos, 0.5 * max.pos, 0.75 * max.pos, 
            max.pos)
        par(las = 1)
        plot(x1$pos, x1$dmp, type = "l", xlab = xlabel, xaxt = "n", 
            ylab = ylabel, col = col.dmp, ylim = c(0, 1))
        axis(1, at = tick.pos, label = fancy_scientific(tick.pos))
        if (noDMP.dens) 
            lines(x2$pos, x2$dmp, col = col.ndmp)
        abline(h = yintercept, col = col.yintercept, lty = type.yintercept)
    } else {
        return(data.frame(DMP.coordinate = x1$pos, DMP.density = x1$dmp, 
            coordinate = x2$pos, density = x2$dmp))
    }
}