R/divTest.R

Defines functions divTest

Documented in divTest

#' @rdname divTest
#' @title Group Comparisons of Information Divergences Based on Generalized
#'     Linear Model
#' @description Generalized Linear Model for group comparison of information
#'     divergence variables yielded by function 
#'     \code{\link[MethylIT]{estimateDivergence}} from MethylIT R package output.
#'     Basically, this a wrapping function to perform the fitting of generalized
#'     linear models with \code{\link[stats]{glm}} from 'stats' package to any
#'     variable of interest given in GRanges objects of 
#'     \code{\link[MethylIT]{estimateDivergence}} output.
#' @details The default parameter setting glm.family = Gamma(link = "log") is
#'     thought to perform the group comparison of the sums of absolute
#'     differences of methylation levels (total variation distance (TVD) at
#'     gene-body DIMPs on DMGs). The sums of Hellinger divergence (HD, at
#'     gene-body DIMPs on DMGs) can be tested with this setting as well. Both
#'     TVD and HD follow asymptotic Chi-square distribution and, consequently,
#'     so do the sum of TVD and the sum of HD.  The Chi-square distribution is
#'     a particular case of Gamma distribution: \cr
#'         \deqn{f(x|a,s) = 1/(s^a Gamma(a)) x^(a-1) e^-(x/s)}
#'     Chi-square density is derived after replacing a = n/2 and s = 2: \cr
#'         \deqn{f(x|n) = 1/(2^(n/2) Gamma(n/2)) x^(n/2-1) e^(-x/2)}
#' @param GR GRanges objects including control and treatment samples containing
#'     an information divergence of methylation levels. The names for each
#'     column must coincide with the names given for parameters:
#'     'control.names' and 'treatment.names'.
#' @param control.names Names/IDs of the control samples, which must be
#'     included in the variable GR in a metacolumn.
#' @param treatment.names Names/IDs of the treatment samples, which must be
#'     included in the variable GR in a metacolumn.
#' @param glm.family,link Parameter to be passed to function
#'     \code{\link[stats]{glm}}. A description of the error distribution and
#'     link function to be used in the model. For \code{\link[stats]{glm}} this
#'     can be a character string naming a family function, or the result of a
#'     call to a family function. For \code{\link[stats]{glm}}.fit only the
#'     third option is supported. (See\code{\link[stats]{family}} function).
#'     Default: glm.family=Gamma(link ="log").
#' @param var.weights Logical (default: FALSE). Whether to use group variances
#'     as weights.
#' @param weights An optional list of two numeric vectors of ‘prior weights’ to
#'     be used in the fitting process. One vector of weights for the control
#'     and one for the treatment. Each vector with length equal to length(GR)
#'     (default: NULL). Non-NULL weights can be used to indicate that different
#'     observations have different dispersions (with the values in weights
#'     being inversely proportional to the dispersions).
#' @param varFilter Numeric (default: 0). GLM will be performed only for those
#'     rows (ranges denoting genomic regions) where the group variance is
#'     greater the number specified by varFilter.
#' @param meanFilter Numeric (default: 0). GLM will be performed only for those
#'     rows (ranges denoting genomic regions) where the absolute difference of
#'     group means is  greater the number specified by meanFilter.
#' @param FilterLog2FC if TRUE, the results are filtered using the minimun
#'     absolute value of log2FoldChanges observed to accept that a gene in the
#'     treatment is differentially expressed in respect to the control.
#' @param Minlog2FC minimum logarithm base 2 of fold changes
#' @param divPerBp At least for one group the mean divergence per bp must be
#'     equal to or greater than 'divPerBp' (default divPerBp = 0.001).
#' @param minInd Integer (Default: 2). At least one group must have 'minInd'
#'     individuals with a divergence value greater than zero.
#' @param pAdjustMethod Method used to adjust the results; default: "NULL"
#'     (see \code{\link[stats]{p.adjust}}.methods). The p-value adjustment is
#'     performed using function \code{\link[stats]{p.adjust}}.
#' @param scaling integer (default 1). Scaling factor estimate the
#'     signal density as: scaling * "DIMP-Count-Per-Bp". For example,
#'     if scaling = 1000, then signal density denotes the number of DIMPs in
#'      1000 bp.
#' @param pvalCutOff cutoff used then a p-value adjustment is performed
#' @param saveAll if TRUE all the temporal results that passed filters
#'     'varFilter' and are 'meanFilter' returned. If FALSE, only the
#'     comparisons that passed filters 'varFilter', 'meanFilter', and
#'     pvalue < pvalCutOff or adj.pvalue < pvalCutOff (if pAdjustMethod is not
#'     NULL) are returned.
#' @param num.cores The number of cores to use, i.e. at most how many child
#'     processes will be run simultaneously (see
#'     \code{\link[BiocParallel]{bplapply}} function from BiocParallel).
#' @param tasks integer(1). The number of tasks per job.  Value must be a scalar
#'     integer >= 0L. In this documentation a job is defined as a single call to
#'     a function, such as bplapply, bpmapply etc. A task is the division of the
#'     X argument into chunks. When tasks == 0 (default), X is divided as evenly
#'     as possible over the number of workers (see
#'     \code{\link[BiocParallel]{MulticoreParam-class}} from BiocParallel
#'     package).
#' @param verbose if TRUE, prints the function log to stdout
#' @param ... Additional parameters passed to \code{\link[stats]{glm}} function.
#' @importFrom stats glm Gamma anova
#' @importFrom genefilter rowVars
#' @importFrom BiocParallel MulticoreParam bplapply
#' @return The original GRanges object with the columns "beta", "log2FC",
#'     "pvalue", "adj.pval" (if pAdjustMethod requested), "CT.divPerBp" and
#'     "TT.divPerBp" (divergence per base pairs), and "divPerBpVariation added.
#' @examples
#' ## Gene annotation
#' genes <- GRanges(seqnames = "1",
#'                  ranges = IRanges(start = c(3631, 6788, 11649),
#'                                   end = c(5899, 9130, 13714)),
#'                  strand = c("+", "-", "-"))
#' mcols(genes) <- data.frame(gene_id = c("AT1G01010", "AT1G01020",
#'                                        "AT1G01030"))
#' # === The number of cytosine sites to generate ===
#' sites = 11001
#' # == Set a seed for pseudo-random number generation ===
#' set.seed(123)
#' alpha.ct <- 0.09
#' alpha.tt <- 0.2
#' # === Simulate samples ===
#' ref = simulateCounts(num.samples = 2, sites = sites, alpha = alpha.ct,
#'                    beta = 0.5, size = 50, theta = 4.5, sample.ids = "R1")
#'
#' # Control group
#' ctrl = simulateCounts(num.samples = 2, sites = sites, alpha = alpha.ct,
#'                        beta = 0.5, size = 50, theta = 4.5,
#'                        sample.ids = c("C1", "C2"))
#' # Treatment group
#' treat = simulateCounts(num.samples = 2, sites = sites, alpha = alpha.tt,
#'                         beta = 0.5, size = 50, theta = 4.5,
#'                         sample.ids = c("T1", "T2"))
#'
#' #  === Estime Divergences ===
#' HD = estimateDivergence(ref = ref$R1, indiv = c(ctrl, treat),
#'                         Bayesian = TRUE, num.cores = 1L, percentile = 1,
#'                         verbose = FALSE)
#'
#' nlms <- nonlinearFitDist(HD, column = 4, verbose = FALSE)
#'
#' ## Next, the potential signal can be estimated
#' PS <- getPotentialDIMP(LR = HD, nlms = nlms, div.col = 4, alpha = 0.05)
#'
#' ## The cutpoint estimation used to discriminate the signal from the noise
#' cutpoints <- estimateCutPoint(PS, control.names = c("C1", "C2"),
#'                               treatment.names = c("T1", "T2"),
#'                               div.col = 4, verbose = FALSE)
#' ## DIMPs are selected using the cupoints
#' DIMPs <- selectDIMP(PS, div.col = 9, cutpoint = min(cutpoints$cutpoint))
#'
#' ## Finally DIMPs statistics genes
#' tv_DIMPs = getGRegionsStat(GR = DIMPs, grfeatures = genes, stat = "sum",
#'                            absolute = TRUE, column = 7L)
#'
#' GR_tv_DIMP = uniqueGRanges(tv_DIMPs, type = "equal", chromosomes = "1")
#' colnames(mcols(GR_tv_DIMP)) <-  c("C1", "C2", "T1", "T2")
#'
#' res <- divTest(GR=GR_tv_DIMP, control.names =  c("C1", "C2"),
#'                treatment.names = c("T1", "T2"))
#' @export
divTest <- function(GR, control.names, treatment.names,
                   glm.family=Gamma(link="log"), var.weights = FALSE,
                   weights=NULL, varFilter=0, meanFilter=0, FilterLog2FC=TRUE,
                   Minlog2FC=1, divPerBp=0.001, minInd=2, pAdjustMethod=NULL,
                   scaling=1L, pvalCutOff=0.05, saveAll=FALSE, num.cores=1,
                   tasks=0L, verbose=TRUE, ...) {
   if (!inherits(GR, "GRanges")) {
       stop("* 'GR' must be an object from class 'GRanges'")
   }
   GR=GR[, c(control.names, treatment.names)]
   CT=as.matrix(mcols(GR[, control.names]))
   TT=as.matrix(mcols(GR[, treatment.names]))
   m=as.matrix(mcols(GR))
   # === Pre-filtering data === #
   g1=length(control.names); g2=length(treatment.names)
   # At least one group must have 'minInd' individuals with a divergence value
   # greater than zero.
   idx=which((rowSums(CT > 0) >= max(g1 - 1, minInd)) |
               (rowSums(TT > 0) >= max(g2 - 1, minInd)))
   m=m[idx,]; GR=GR[idx]; CT=CT[idx,]; TT=TT[idx,]
   # filtering based on group variance
   vars=rowVars(m)
   idx=which(vars > varFilter)
   m=m[idx,]; GR=GR[idx]; CT=CT[idx,]; TT=TT[idx,]
   ct.mean=rowMeans(CT)
   tt.mean=rowMeans(TT)
   mean.diff=abs(tt.mean - ct.mean)
   idx=which(mean.diff > meanFilter)
   m=m[idx,]; GR=GR[idx]; CT=CT[idx,]; TT=TT[idx,]

   if (!is.null(divPerBp)) {
     ## For each group the divergence per bp must be equal or greater
     ## than divPerBp
     size <- width(GR)
     CT.divPerBp=scaling * rowMeans(CT)/size
     TT.divPerBp=scaling * rowMeans(TT)/size
     idx <- ((CT.divPerBp >= divPerBp) | (TT.divPerBp >= divPerBp))
     m=m[idx,]; GR=GR[idx]; CT=CT[idx,]; TT=TT[idx,]
     CT.divPerBp=CT.divPerBp[idx]; TT.divPerBp=TT.divPerBp[idx]
   } else stop("*** The divergence signal must be divPerBp >= 0")

   if (verbose) {
       message("*** Number of genes after filtering divergences ", nrow(m))
   }
   # === weights === #
   wg=NULL
   if(var.weights==TRUE && is.null(weights)) {
       ct.var=rowVars(CT)
       tt.var=rowVars(TT)
       wg=cbind(1/(ct.mean + ct.var * ct.mean^2),
                1/(tt.mean + tt.var * tt.mean^2))
       rm(CT, TT, ct.var, tt.var); gc()
   }
   if(!is.null(weights)) {
     ct.var=weights[[1]]
     tt.var=weights[[2]]
     wg=cbind(ct.var, tt.var)
   }

   div = t(m); rm(m, vars, ct.mean, tt.mean, idx, mean.diff); gc()
   group <- factor(c(rep(1, length(control.names)),
                   rep(2, length(treatment.names))))
   group <- relevel(group, ref = "1")

   # ===== Auxiliar function to perform beta regression ===== #
   divglm <- function(x, grp, family, weights, ...) {
       model = tryCatch(glm(x ~ grp, family=family, weights=weights, ...),
               error=function(e) {
                   n=length(grp)
                   x=(x * (n - 1) + 0.5)/n
                   try(glm(x ~ grp, family=family, weights=weights, ...),
                       silent=TRUE)})
       if(!is.null(weights)) {
         n=as.vector(table(grp))
         w=c(rep(weights[1], n[1]), rep(weights[2], n[2]))
         rw = tryCatch(glm(x ~ grp, family=family, weights=w, ...),
                       error=function(e) {
                       n=length(grp)
                       x=(x * (n - 1) + 0.5)/n
                       try(glm(x ~ grp, family=family, weights=w, ...),
                           silent=TRUE)})
         if (!inherits(model, "try-error") && !inherits(rw, "try-error")) {
           midx=which.min(c(AIC(model), AIC(rw)))
           model=list(model, rw)[[midx]]
         }
         if (inherits(model, "try-error") && !inherits(rw, "try-error")) {
           model=rw
         }
       }

       if (!inherits(model, "try-error")) {
           beta=unname(coef(model)[2])
           log2FC=beta/log(2)
           pvalue=anova(model, test =  "Chisq")[2, 5]
       } else {
           beta=NA; log2FC=NA; pvalue=NA
       }
           return(c(beta=beta, log2FC=log2FC, pvalue=pvalue))
   }

   if (verbose)
       cat("* Performing generalized linear regression analysis ... \n")
   nc=ncol(div)
   if (num.cores > 1) {
       bpparam <- MulticoreParam(workers=num.cores, tasks=tasks)
       if (is.null(wg)) {
           res <- bplapply(1:nc, function(k) {
               divglm(x=div[,k], grp=group, family=glm.family, weights=NULL,
                      ...)},
               BPPARAM=bpparam)
       } else {
           res <- bplapply(1:nc, function(k) {
               divglm(x=div[,k], grp=group, family=glm.family,
                      weights=wg[k,], ...)}, BPPARAM=bpparam)
       }
       res=do.call(rbind, res)
   } else {
       res=matrix(NA, nrow=nc, ncol=3)
       if (is.null(wg)) {
           for (k in 1:nc) {
               res[k,]=divglm(x=div[,k], grp=group, family=glm.family,
                              weights=NULL)}
       } else {
           for (k in 1:nc) {
               res[k,]=divglm(x=div[,k], grp=group, family=glm.family,
                       weights=wg[k,])
           }
       }
       colnames(res) <- c("beta", "log2FC", "pvalue")
   }
   res = data.frame(res)
   res$CT.divPerBp <- CT.divPerBp
   res$TT.divPerBp <- TT.divPerBp
   res$divPerBpVariation <- (TT.divPerBp - CT.divPerBp)

   # === Filtering results ===
   if (FilterLog2FC) fidx <- which(abs(res$log2FC) > Minlog2FC)
   if (!is.null(pAdjustMethod)) {
       res$adj.pval < rep(NA, length(res$pvalue))
       # pvalue adjustment only for the selected comparisons: "fidx"
       if (FilterLog2FC) {
           res$adj.pval[fidx] <- p.adjust(res$pvalue[fidx],
                                  method=pAdjustMethod)
       } else
         # pvalue adjustment for the comparisons
         res$adj.pval <- p.adjust(res$pvalue, method=pAdjustMethod)
   }
   # Only results with "adj.pval < pvalCutOff" are reported if saveAll=FALSE
   if (!is.null(pvalCutOff) && !saveAll) {
      if (!is.null(pAdjustMethod)) {
           idx=which(res$adj.pval < pvalCutOff)
           res <- res[idx, ]
           GR=GR[idx]
      } else {
           idx=which(res$pvalue < pvalCutOff)
           res <- res[idx, ]
           GR=GR[idx]
      }
   }
   # Only results with "log2FC > Minlog2FC" are reported if saveAll=FALSE
   if (FilterLog2FC && !saveAll) {
       idx=which(abs(res$log2FC) > Minlog2FC)
       res <- res[idx, ]
       GR=GR[idx]
   }

  mcols(GR) <- data.frame(GR@elementMetadata, res)
  return(sortBySeqnameAndStart(GR))
}
genomaths/MethylIT.utils documentation built on Nov. 26, 2019, 2:10 a.m.