# R/gammaMixtCut.R In genomaths/MethylIT.utils: MethylIT utility

#### Documented in gammaMixtCut

#' @rdname gammaMixtCut
#' @title Cutpoint estimation based on Mixtures of Gamma Distributions
#' @description This functions estimates cutpoint value to classify DMPs into
#'     two classes: 1) from treatment and 2) from control, based on Mixtures of
#'     Gamma Distributions. The cutpoint estimations are limited to the analysis
#'     of mixture distributions of the form:
#'     \eqn{F(x) = \lambda G(x) + (1 - \lambda) H(x)}, where
#'     \eqn{\lambda \in [0, 1]}, \eqn{G(x)} and \eqn{F(x)} are the gamma
#'     cummulative distribution functions distributions followed by the
#'     information divergences estimated for individuals from control and
#'     treatment populations, respectively.
#' @details After the estimation of potential DMPs, the pool of DMPs from
#'     control and treatment is assumed that follows mixtures of Gamma
#'     distributions corresponding to two populations. A posterior probability
#'     2d-vector is estimated for each DMP. The cutpoint is determined from the
#'     intersection of the two gamma probabilities density distributions. That
#'     is, \eqn{f(x) and g(x)} are the estimatied densities for control and
#'     treatment groups, repectively, then the cutpoint is the values of x for
#'     which \eqn{f(x) = g(x)}.
#'
#'     The Mixtures of Gamma Distributions (MGD) is estimated by using function
#'     \code{\link[mixtools]{gammamixEM}} from package *mixtools*. By default
#'     function \code{\link[mixtools]{gammamixEM}} produces returns a long list
#'     including the posterior probability to belong to the treatment. Here, by
#'     the sake of brevety only the information on the fitted model is given.
#'     The posterior model probability can be retrieved by using *predict*
#'     function. Accordign with MGD model, DMPs with a posterior probability to
#'     belong to the treatment group greater than *post.cut = 0.5* is classified
#'     as *DMP from treatment*. The post.cut can be modified. For all the cases
#'     \eqn{0 < post.cut < 1}. The cutpoint and, hence, the classification
#'     derived throught MGD model might differ from that provided throught
#'     about the DMP and, therefore, reports better performance. The
#'     classification perfomance reported when *clas.perf* = TRUE or *find.cut*
#'     = TRUE is created with function \code{\link{evaluateDIMPclass}} for the
#'     especified matchin learning model.
#'
#'     If parameter *find.cut = TRUE*, then a search for the best
#'     cutpoint in a predifined inteval (*cut.interval*) is performed calling
#' @param LR A "pDMP"or "InfDiv" object obtained with functions
#'     \code{\link[MethylIT]{estimateDivergence}}. These are list of GRanges
#'     objects, where each GRanges object from the list must have at least
#'     two columns: a column containing the total variation of methylation
#'     level (TV, difference of methylation levels) and a column containing a
#'     divergence of methylation levels (it could be TV or  Hellinger
#'     divergence).
#' @param post.cut Posterior probability to dicide whether a DMPs belong to
#'     treatment group. Default *post.cut* = 0.5.
#' @param tv.col Column number where the total variation is located in the
#'     metadata from each GRanges object.
#' @param div.col Column number for divergence of methylation levels used in the
#'     estimation of the cutpoints. Default: 9L (hdiv column from an InfDiv
#'     object).
#' @param tv.cut A cutoff for the total variation distance to be applied to each
#'     site/range. Only sites/ranges *k* with \eqn{TVD_{k} > tv.cut} are
#'     are used in the analysis. Its value must be a number
#'     \eqn{0 < tv.cut < 1}. Default is \eqn{tv.cut = 0.25}.
#' @param control.names,treatment.names Optional. Names/IDs of the control and
#'     treatment samples, which must be include in the variable LR
#'     (default, NULL). However, these are required if any of the parameters
#'     *find.cut* or *clas.perf* are set TRUE.
#' @param treatment.names Optional. Names/IDs of the treatment samples, which
#'     must be include in the variable LR (default, NULL).
#' @param column a logical vector for column names for the predictor variables
#'     to be used: Hellinger divergence "hdiv", total variation "TV",
#'     probability of potential DIMP "wprob", and the relative cytosine site
#'     position "pos" in respect to the chromosome where it is located. The
#'     relative position is estimated as (x - x.min)/(x.max - x), where x.min
#'     and x.max are the maximum and minimum for the corresponding chromosome,
#'     repectively. If "wprob = TRUE", then Logarithm base-10 of "wprob" will
#'     be used as predictor in place of "wprob" ( see
#' @param find.cut Logic. Wether to search for an optimal cutoff value to
#'     classify DMPs based on given specifications.
#' @param classifier Classification model to use. Option "logistic" applies a
#'     logistic regression model; option "lda" applies a Linear Discriminant
#'     Analysis (LDA); "qda" applies a Quadratic Discriminant Analysis (QDA),
#'     "pca.logistic" applies logistic regression model using the Principal
#'     Component (PCs) estimated with Principal Component Analysis (PCA) as
#'     predictor variables. pca.lda" applies LDA using PCs as predictor
#'     variables, and the option "pca.qda" applies a Quadratic Discriminant
#'     Analysis (QDA) using PCs as predictor variables. 'SVM' applies Support
#'     Vector Machines classifier from R package e1071.
#' @param prop Proportion to split the dataset used in the logistic regression:
#'     group versus divergence (at DIMPs) into two subsets, training and
#'     testing.
#' @param clas.perf Logic. Whether to return the classificaiton performance for
#'     the estimated cutpoint. Default, FALSE.
#' @param cut.interval 0 < *cut.interval* < 0.1. If *find.cut*= TRUE, the
#'     interval of treatment group posterior probabilities where to search for a
#'     cutpoint. Deafult *cut.interval* = c(0.5, 0.8).
#' @param cut.incr 0 < *cut.incr* < 0.1. If *find.cut*= TRUE, the sucesive
#'     increamental values runing on the interval *cut.interval*. Deafult,
#'     *cut.incr* = 0.01.
#' @param num.cores,tasks Paramaters for parallele computation using package
#'     \code{\link[BiocParallel]{BiocParallel-package}}: the number of cores to
#'     use, i.e. at most how many child processes will be run simultaneously
#'     (only for Linux OS).
#' @param stat An integer number indicating the statistic to be used in the
#'     testing when *find.cut* = TRUE. The mapping for statistic names are:
#'     0 = "Accuracy", 1 = "Sensitivity", 2 = "Specificity",
#'     3 = "Pos Pred Value", 4 = "Neg Pred Value", 5 = "Precision",
#'     6 = "Recall", 7 = "F1",  8 = "Prevalence", 9 = "Detection Rate",
#'     10 = "Detection Prevalence", 11 = "Balanced Accuracy", 12 = FDR.
#' @param maximize Whether to maximize the performance indicator given in
#'     parameter 'stat'. Default: TRUE.
#' @param ... Additional arameters to pass to functions
#' @importFrom mclust Mclust
#' @importFrom mixtools gammamixEM
#' @importFrom stats uniroot dgamma
#' @importFrom MethylIT selectDIMP unlist validateClass
#' @export

gammaMixtCut <- function(LR, post.cut = 0.5, div.col=NULL, tv.col=NULL,
tv.cut=0.25, find.cut=FALSE,
control.names=NULL, treatment.names=NULL,
column=c(hdiv=FALSE, TV=FALSE, wprob=FALSE, pos=FALSE),
classifier=c("logistic", "pca.logistic", "lda",
"svm", "qda","pca.lda", "pca.qda"),
prop=0.6, clas.perf = FALSE, cut.interval = c(0.5, 0.8),
cut.incr = 0.01, stat = 1, maximize = TRUE, num.cores=1L,
tasks=0L, tol = .Machine$double.eps^0.5, maxiter = 1000, ...) { # -------------------------- valid "pDMP" object--------------------------- # validateClass(LR) # ---------------------------------------------------=--------------------- # if ((find.cut || clas.perf) && is.null(div.col)) stop("If findcut or clas.perf is TRUE, a div.col value must be provided") cl <- class(LR) divs = unlist(LR) div <- abs(mcols(divs[, div.col])[, 1]) # To remove divs == 0. The methylation signal only is given for divs > 0 divs = divs[ div > 0 ] # ============= Obtain a prior classification model =========== fit <- Mclust(div, G=2, model="V", prior = priorControl()) alpha <- fit$parameters$mean^2/fit$parameters$variance$sigmasq
beta <- fit$parameters$variance$sigmasq/fit$parameters$mean pro <- fit$parameters$pro # ================= Fit Gamma mixture ========================== y1 <- gammamixEM(div, lambda = pro, alpha = alpha, beta = beta, verb = FALSE) # === Optimal cutpoint from the intersection of gamma distributios === idx <- y1$posterior[,1] > post.cut
par1 <- y1$gamma.pars[,1] par2 <- y1$gamma.pars[,2]
lower <- max(min(y1$x[idx]), min(y1$x[-idx]))
upper <- min(max(y1$x[idx]), max(y1$x[-idx]))
zerofun <- function(x) {
dgamma(x, shape = par1, scale = par1) -
dgamma(x, shape = par2, scale = par2)
}

zero <- try(uniroot(zerofun, interval = c(lower, upper),
tol = .Machine$double.eps^0.5, maxiter = 1000), silent = TRUE) if (!inherits(zero, "try-error")) zero <- unlist(zero) else zero <- rep(NA, 5) names(zero) <- c("cutpoint", "error", "iter", "init.it", "estim.prec") if (cl != "pDMP") { alpha <- c(par1, par2) beta <- c(par1, par2) LR <- lapply(LR, function(x) { div <- abs(mcols(x[, div.col])[, 1]) x$wprob <- pmixgamma(div, pi, alpha = alpha, beta = beta, ...)
return(x)
})
class(LR) <- "pDMP"
divs
}

# Auxiliar function to find cutpoint/intersection point of the two gamma
# distributions
cutFun <- function(post.cut) {
idx <- which(y1$posterior[, 2] > post.cut) TT <- div[idx] CT <- div[-idx] return(max(c(max(CT),min(TT)))) } # -------------------------------------------------------------------- # if (find.cut) { cuts <- seq(cut.interval, cut.interval, cut.incr) k = 1; opt <- FALSE if (stat == 12) st = 1 else st = 0 k = 1; opt <- FALSE; cutprob <- cuts while (k < length(cuts) && !opt) { cutpoint <- cutFun(cuts[k]) dmps <- selectDIMP(LR, div.col = div.col, cutpoint = cutpoint, tv.col=tv.col, tv.cut=tv.cut) conf.mat <- evaluateDIMPclass(dmps, column = column, control.names = control.names, treatment.names = treatment.names, classifier=classifier, prop=prop, output = "conf.mat", num.cores=num.cores, tasks=tasks, verbose = FALSE) if (stat == 0) { st0 <- conf.mat$Performance$overall if (st < st0) { st <- st0 cutprob <- cuts[k] } if (st == 1) opt <- TRUE k <- k + 1 } if (is.element(stat, 1:11)) { st0 <- conf.mat$Performance$byClass[stat] if (st < st0) { st <- st0 cutprob <- cuts[k] } if (st == 1) opt <- TRUE k <- k + 1 } if (stat == 12) { st0 <- conf.mat$FDR
if (st > st0) {
st <- st0
cutprob <- cuts[k]
}
if (st == 0) opt <- TRUE
k <- k + 1
}
}
conf.mat <- c(Cutpoint=cutFun(cuts[k]), PostProbCut = cuts[k], conf.mat)
}
# -------------------------------------------------------------------- #

if (clas.perf && !find.cut) {
dmps <- selectDIMP(LR, div.col = div.col, cutpoint = zero,
tv.col=tv.col, tv.cut=tv.cut)
conf.mat <- evaluateDIMPclass(dmps, column = column,
control.names = control.names,
treatment.names = treatment.names,
classifier=classifier, prop=prop,
output = "conf.mat", num.cores=num.cores,
}
# -------------------------------------------------------------------- #
y <- structure(list(gammaPars=y1$gamma.pars, lambda=y1$lambda,
loglik=y1\$loglik), class="GammaMixt"); rm(y1); gc()
if (clas.perf && !find.cut) {
res <- list(gammaMixtureCut=zero, conf.mat = conf.mat, gammaMixture = y)
cat("\n")
cat("Cutpoint estimation with Mixtures of Gamma Distributions \n")
cat("\n")
cat("Cutpoint =", zero, "\n")
cat("\n")
cat("The accessible objects in the output list are: \n")
print(summary(res))
}
if (find.cut) {
res <- list(gammaMixtureCut=zero, conf.mat = conf.mat, gammaMixture = y)
STAT <- c("Accuracy", "Sensitivity", "Specificity", "Pos Pred Value",
"Neg Pred Value","Precision", "Recall", "F1", "Prevalence",
"Detection Rate", "Detection Prevalence", "Balanced Accuracy",
"FDR")
cat("\n")
cat("Cutpoint estimation with Mixtures of Gamma Distributions \n")
cat("\n")
cat("Cutpoint =", zero, "\n")
cat("\n")
cat("Cutpoint search performed using model posterior probabilities \n")
cat("\n")
cat("Optimized statistic:", STAT[stat + 1], "=", st, "\n")
cat("Cutpoint =", cutpoint, "\n")
cat("PostProbCut =", cuts[k], "\n")
cat("\n")
cat("Cytosine sites with treatment PostProbCut >=", cuts[k], "have a \n")
cat("divergence value >=", cutpoint, "\n")
cat("\n")
cat("The accessible objects in the output list are: \n")
print(summary(res))
}
if (!clas.perf && !find.cut) {
res <- list(gammaMixtureCut=zero, gammaMixture=y)
cat("\n")
cat("Cutpoint estimation with Mixtures of Gamma Distributions \n")
cat("\n")
cat("Cutpoint =", zero, "\n")
cat("\n")
cat("The accessible objects in the output list are: \n")
print(summary(res))
}
invisible(res)
}

genomaths/MethylIT.utils documentation built on Nov. 26, 2019, 2:10 a.m.