R/propTest.R

Defines functions propTest

Documented in propTest

#' @rdname propTest
#' @title Beta Regression for methylation levels and rates
#' @description Beta Regression analysis for treatment versus control group
#'     comparison of methylation levels, appends three new metacolumns 'beta',
#'     'log2FC', 'pvalue' to the provided GRanges argument
#' @details Beta Regression analysis for group comparison of methylation levels
#'     is performed using the function \code{\link[betareg]{betareg}}.
#' @param GR GRanges objects including control and treatment samples containing
#'     the methylation levels. The name for each column must coincide with the
#'     names given for parameters: 'control.names' and 'treatment.names'.
#' @param control.names Names/IDs of the control samples, which must be include
#'     in the variable GR at the metacolumn.
#' @param treatment.names Names/IDs of the treatment samples, which must be
#'     included in the variable GR at the metacolumn.
#' @param link Parameter to be passed to function 'betareg' from package
#'     'betareg'. character specification of the link function in the mean model
#'     (mu). Currently, 'logit', 'probit', 'cloglog', 'cauchit', 'log',
#'     'loglog' are supported. Alternatively, an object of class 'link-glm'
#'     can be supplied (see \code{\link[betareg]{betareg}}).
#' @param type Parameter to be passed to function 'betareg' from package
#'     'betareg'. A character specification of the type of estimator. Currently,
#'     maximum likelihood ('ML'), ML with bias correction ('BC'), and ML with
#'     bias reduction ('BR') are supported.
#' @param tv.cut A cutoff for the total variation distance (TVD; absolute value
#'     of methylation levels differences) estimated at each site/range as the
#'     difference of the group means of methylation levels. If tv.cut is
#'     provided, then sites/ranges k with abs(TV_k) < tv.cut are removed before
#'     to perform the regression analysis. Its value must be NULL or a number
#'     0 < tv.cut < 1.
#' @param indvPerGrp An integer number giving the minimum number of individuals
#'     per group at each site/region. Default 2.
#' @param FilterLog2FC if TRUE, the results are filtered using the minimun
#'     absolute value of log2FoldChanges observed to accept that a gene in the
#'     treatment is differentially expressed in respect to the control.
#' @param pAdjustMethod method used to adjust the results; default: BH
#' @param pvalCutOff cutoff used, then a p-value adjustment is performed. If
#'     NULL all the reported p-values are for testing.
#' @param Minlog2FC minimum logarithm base 2 of fold changes.
#' @param saveAll if TRUE all the temporal results are returned.
#' @param num.cores The number of cores to use, i.e. at most how many child
#'     processes will be run simultaneously (see bpapply function from
#'     BiocParallel).
#' @param tasks integer(1). The number of tasks per job. value must be a scalar
#'     integer >= 0L. In this documentation a job is defined as a single call
#'     to a function, such as bplapply, bpmapply etc. A task is the division of
#'     the X argument into chunks. When tasks == 0 (default), X is divided as
#'     evenly as possible over the number of workers (see MulticoreParam from
#'     BiocParallel package).
#' @param verbose if TRUE, prints the function log to stdout
#' @importFrom betareg betareg
#' @importFrom BiocParallel MulticoreParam bplapply
#' @importFrom stats quasibinomial coefficients
#' @return The original GRanges object with the columns 'beta', 'log2FC',
#'     'pvalue', and TV added.
#' @examples
#' num.cyt <- 11001 # Number of cytosine position with methylation call
#' max.cyt = 14000
#' ## Gene annotation
#' genes <- GRanges(seqnames = '1',
#'                  ranges = IRanges(start = c(3631, 6788, 11649),
#'                                   end = c(5899, 9130, 13714)),
#'                  strand = c('+', '-', '-'))
#' mcols(genes) <- data.frame(gene_id = c('AT1G01010', 'AT1G01020',
#'                                        'AT1G01030'))
#'
#' set.seed(123) #'#' To set a seed for random number generation
#' ## GRanges object of the reference with methylation levels in
#' ## its meta-column
#' Ref <- makeGRangesFromDataFrame(
#'   data.frame(chr = '1',
#'              start = 3000:max.cyt,
#'              end = 3000:max.cyt,
#'              strand = '*',
#'              p1 = rbeta(num.cyt, shape1 = 1, shape2 = 1.5)),
#'   keep.extra.columns = TRUE)
#'
#' ## List of Granges objects of individuals methylation levels
#' Indiv <- GRangesList(
#'   sample11 = makeGRangesFromDataFrame(
#'     data.frame(chr = '1',
#'                start = 3000:max.cyt,
#'                end = 3000:max.cyt,
#'                strand = '*',
#'                p2 = rbeta(num.cyt, shape1 = 1.5, shape2 = 2)),
#'     keep.extra.columns = TRUE),
#'   sample12 = makeGRangesFromDataFrame(
#'     data.frame(chr = '1',
#'                start = 3000:max.cyt,
#'                end = 3000:max.cyt,
#'                strand = '*',
#'                p2 = rbeta(num.cyt, shape1 = 1.6, shape2 = 2.1)),
#'     keep.extra.columns = TRUE),
#'   sample21 = makeGRangesFromDataFrame(
#'     data.frame(chr = '1',
#'                start = 3000:max.cyt,
#'                end = 3000:max.cyt,
#'                strand = '*',
#'                p2 = rbeta(num.cyt, shape1 = 10, shape2 = 4)),
#'     keep.extra.columns = TRUE),
#'   sample22 = makeGRangesFromDataFrame(
#'     data.frame(chr = '1',
#'                start = 3000:max.cyt,
#'                end = 3000:max.cyt,
#'                strand = '*',
#'                p2 = rbeta(num.cyt, shape1 = 11, shape2 = 4)),
#'     keep.extra.columns = TRUE))
#' ## To estimate Hellinger divergence using only the methylation levels.
#' HD <- estimateDivergence(ref = Ref, indiv = Indiv, meth.level = TRUE,
#'                          columns = 1)
#' ## To perform the nonlinear regression analysis
#' nlms <- nonlinearFitDist(HD, column = 4, verbose = FALSE)
#'
#' ## Next, the potential signal can be estimated
#' PS <- getPotentialDIMP(LR = HD, nlms = nlms, div.col = 4, alpha = 0.05)
#'
#' ## The cutpoint estimation used to discriminate the signal from the noise
#' cutpoints <- estimateCutPoint(PS, control.names = c('sample11', 'sample12'),
#'                               treatment.names = c('sample21', 'sample22'),
#'                               div.col = 4, verbose = TRUE)
#' ## DIMPs are selected using the cupoints
#' DIMPs <- selectDIMP(PS, div.col = 4, cutpoint = min(cutpoints$cutpoint))
#'
#' ## Finally DIMPs statistics genes
#' p_DIMPs = getGRegionsStat(GR = DIMPs, grfeatures = genes, stat = 'mean',
#'                           prob = TRUE, column = 2L)
#'
#' GR_p_DIMP = uniqueGRanges(p_DIMPs, type = 'equal', chromosomes = '1')
#' colnames(mcols(GR_p_DIMP)) <-  c('sample11', 'sample12', 'sample21',
#'                                 'sample22')
#' names(GR_p_DIMP) <- genes$gene_id
#'
#' ## Group differences between methylation levels
#' propTest(GR = GR_p_DIMP, control.names = c('sample11', 'sample12'),
#'          treatment.names = c('sample21', 'sample22'))
#' @export
propTest <- function(GR, control.names, treatment.names, link = "logit", 
    type = "ML", tv.cut = NULL, indvPerGrp = 0, FilterLog2FC = TRUE, 
    pAdjustMethod = "BH", pvalCutOff = 0.05, Minlog2FC = 0.5, saveAll = FALSE, 
    num.cores = 1, tasks = 0L, verbose = TRUE) {
    if (!inherits(GR, "GRanges")) {
        stop("* 'GR' must be an object from class 'GRanges'")
    }
    GR <- GR[, c(control.names, treatment.names)]
    prop <- as.matrix(mcols(GR))
    g1 <- match(control.names, c(control.names, treatment.names))
    g2 <- match(treatment.names, c(control.names, treatment.names))
    idx <- (rowSums(prop[, g1] > 0) >= indvPerGrp | rowSums(prop[, 
        g2] > 0) >= indvPerGrp)
    GR <- GR[idx]
    prop <- prop[idx, ]
    GR$TV <- rowMeans(prop[, g2]) - rowMeans(prop[, g1])
    if (!is.null(tv.cut)) {
        idx.tv <- (abs(GR$TV) >= tv.cut)
        if (sum(idx.tv) > 0) {
            idx.tv <- which(idx.tv)
            prop <- prop[idx.tv, ]
        } else stop("*** tv.cut is higher than data TV values")
    }
    
    prop <- t(prop)
    group <- factor(c(rep(1, length(g1)), rep(2, length(g2))))
    group <- relevel(group, ref = "1")
    
    # Auxiliar function to perform beta regression
    betaReg <- function(x, grp, link, type) {
        r <- tryCatch(betareg(x ~ grp, link = link, type = type), 
            error = function(e) {
                n <- length(grp)
                x <- (x * (n - 1) + 0.5)/n
                try(betareg(x ~ grp, link = link, type = type), silent = TRUE)
            })
        if (inherits(r, "try-error")) {
            r <- try(glm(x ~ grp, family = quasibinomial(link = link)), 
                silent = TRUE)
        }
        
        if (!inherits(r, "try-error")) {
            beta <- unname(coef(r)[2])
            log2FC <- beta/log(2)
            if (inherits(r, "betareg")) {
                pvalue <- unname(coefficients(summary(r))$mean[2, 
                  4])
            } else pvalue <- coef(summary(r))[2, 4]
        } else {
            beta <- NA
            log2FC <- NA
            pvalue <- NA
        }
        return(c(beta = beta, log2FC = log2FC, pvalue = pvalue))
    }
    
    if (verbose) 
        cat("*** Performing beta-regression analysis ... \n")
    n <- ncol(prop)
    if (num.cores > 1) {
        bpparam <- MulticoreParam(workers = num.cores, tasks = tasks)
        res <- bplapply(1:n, function(k) {
            betaReg(x = prop[, k], grp = group, link = link, type = type)
        }, BPPARAM = bpparam)
        res <- do.call(rbind, res)
        res <- data.frame(res)
    } else {
        res <- matrix(NA, nrow = n, ncol = 3)
        for (k in 1:n) {
            res[k, ] <- betaReg(x = prop[, k], grp = group, link = link, 
                type = type)
        }
        res <- data.frame(res)
        colnames(res) <- c("beta", "log2FC", "pvalue")
    }
    
    if (all(is.na(res))) 
        stop("*** The regression model does not fit your data")
    
    if (saveAll && !is.null(tv.cut)) {
        l <- length(GR)
        GR$beta <- rep(NA, l)
        GR$beta[idx.tv] <- res$beta
        GR$log2FC <- rep(NA, l)
        GR$log2FC[idx.tv] <- res$log2FC
        GR$pvalue <- rep(NA, l)
        GR$pvalue[idx.tv] <- res$pvalue
    } else {
        if (!is.null(tv.cut)) {
            GR <- GR[idx.tv]
            mcols(GR) <- data.frame(GR@elementMetadata, res)
        }
        if (is.null(tv.cut)) 
            mcols(GR) <- data.frame(GR@elementMetadata, res)
    }
    
    if (!saveAll) 
        GR <- GR[!is.na(GR$log2FC)]
    if (FilterLog2FC && is.null(pvalCutOff) && !saveAll) {
        GR <- GR[abs(GR$log2FC) > Minlog2FC]
    }
    GR$adj.pval <- rep(NA, length(GR))
    if (!is.null(pvalCutOff) && !saveAll) {
        if (FilterLog2FC) 
            GR <- GR[abs(GR$log2FC) > Minlog2FC, ]
        GR$adj.pval <- p.adjust(GR$pvalue, method = pAdjustMethod)
        GR <- GR[GR$adj.pval < pvalCutOff, ]
    } else {
        if (!is.null(pvalCutOff) && saveAll && !FilterLog2FC) {
            idx <- which(!is.na(GR$adj.pval))
            GR$adj.pval[idx] <- p.adjust(GR$pvalue[idx], method = pAdjustMethod)
        }
        if (FilterLog2FC) {
            idx <- which(abs(GR$log2FC) > Minlog2FC)
            pval <- GR$pvalue[idx]
            GR$adj.pval[idx] <- p.adjust(pval, method = pAdjustMethod)
        } else {
            GR$adj.pval <- p.adjust(GR$pvalue, method = pAdjustMethod)
        }
    }
    return(GR)
}
genomaths/MethylIT.utils documentation built on July 4, 2023, 12:05 a.m.