R/mixGaussianFit.r

Defines functions plot.mgfit predict.mgfit mixGaussianFit getPeaks getPeaks3

# Functionsto fit mixtures of Gaussians All the functions below are internal to
# the package

# Find the modes for a multimodal distribution This function returns the modes
# of a multimodal distribution
# @param x Numeric vector
# @param adj Number indicating the adjust parameter for bandwidth of the
# density estimation
# @param thr Threshold for lambda, distrivutions with lambda below this
# threshold will be discarded @return list of three elements: mean, sd and
# lambda
getPeaks3 <- function(x, adj = 1.2, thr = 0.01) {
    den <- density(x, adj = adj, na.rm = TRUE, n = 512 * 50)
    sp <- smooth.spline(den)
    sp2 <- predict(sp, den$x, deriv = 2)$y
    pos <- which(sp2[-length(sp2)] * sp2[-1] < 0)
    x2 <- (den$x[pos] + den$x[pos + 1])/2
    sp3 <- predict(sp, x2, deriv = 3)$y
    posi <- lapply(which(sp3 < 0), function(i, x2, posi) {
        if (length(x2) < (i + 1))
            return(NULL)
        pos <- which(posi > x2[i] & posi < x2[i + 1])
        if (length(pos) == 0)
            return((x2[i + 1] + x2[i])/2)
        return(posi[pos[1]])
    }, x2 = x2, posi = getPeaks(x, adj = adj))
    posi <- unlist(posi[vapply(posi, length, numeric(1)) > 0],
                   use.names = FALSE)
    posi1 <- lapply(which(sp3 > 0), function(i, x2) {
        if (length(x2) < (i + 1))
            return(NULL)
        (x2[i] + x2[i + 1])/2
    }, x2 = x2)
    posi1 <- unlist(posi1[vapply(posi1, length, numeric(1)) > 0],
                    use.names = FALSE)
    posi <- vapply(posi, function(posi, x) which(x > posi)[1], numeric(1),
                   x = den$x)
    posi1 <- vapply(posi1, function(posi, x) which(x > posi)[1], numeric(1),
                    x = den$x)
    m <- sort(den$x[posi])
    if (length(posi1) == 0) {
        b <- c(min(den$x) - diff(range(den$x)), max(den$x) + diff(range(den$x)))
    } else {
        b <- c(min(den$x) - diff(range(den$x)), sort(den$x[posi1]),
                max(den$x) + diff(range(den$x)))
    }
    sigma <- vapply(m, function(x, den, b) {
        ymax <- approx(den, xout = x)$y
        x1 <- den$x[den$y < (ymax/2)]
        dd <- x1 - x
        dd1 <- b - x
        x2 <- c(x1[dd < 0][which.max(dd[dd < 0])],
            x1[dd > 0][which.min(dd[dd > 0])])
        x3 <- c(b[dd1 < 0][which.max(dd1[dd1 < 0])], b[dd1 > 0]
            [which.min(dd1[dd1 > 0])])
        dd2 <- x2 - x
        dd3 <- x3 - x
        opt <- c(dd2[1] - dd3[1], dd3[2] - dd2[2])
        pos <- which(opt > 0)
        if (length(pos) > 0) {
            tmp <- dd2[pos][which.min(abs(dd2[pos]))]
        } else {
            tmp <- dd3[which.min((dd2 - dd3)^2)]
        }
        res <- -3.934e-06 + 0.8493 * abs(tmp)
        if (length(res) == 0)
            res <- NA
        res
    }, numeric(1), den = den, b = b)
    lambda <- vapply(seq_len(length(m)), function(i, m, sigma, den) {
        if (is.na(sigma[i]))
            return(NA)
        x2 <- seq(m[i] - sigma[i], m[i] + sigma[i], length = 100)
        integrateTZ(x2, approx(den, xout = x2)$y)
    }, numeric(1), m = m, sigma = sigma, den = den)
    lambda[is.na(lambda)] <- 0
    lambda <- lambda/sum(lambda)
    pos <- which(lambda > thr)
    list(m = m[pos], sigma = sigma[pos], lambda = lambda[pos])
}

# Find the modes for a multimodal distribution This function returns the modes
# of a multimodal distribution @param x Numeric vector
# @param adj Number
# indicating the adjust parameter for bandwidth of the density estimation
# @return Numeric
# vector of modes
getPeaks <- function(x, adj = 1.5) {
    den <- density(x, adj = adj, na.rm = TRUE, n = 512 * 50)
    posi <- which(c(FALSE, diff(diff(den$y) > 0) < 0, FALSE))
    posi <- posi[order(den$y[posi], decreasing = TRUE)]
    sort(den$x[posi])
}

# Fit a mixture of gaussian curves This function fits a mixture of gaussians to
# the distribution of any population
# @param x Numeric vectors @param thr Minimum lambda (proportion of the
# distribution) to include in the analysis
# @param min Optional minimum number of distributions
# @param max Optional maximum number of distributions
# @param adj Optional vector of 2 components defining the range of
# values for the density adj parameter @return Object of class mgfit. List of
# fitted parameters
mixGaussianFit <- function(x, thr = 0.01, min = 1, max = 1e+06, adj = c(0.8, 2))
{
    x <- x[is.finite(x)]
    fit <- lapply(seq(min(adj), max(adj), length = 10), function(adj, x, thr,
        den, min, max) {
        param <- getPeaks3(x, adj = adj, thr = thr)
        param <- lapply(param, function(x) c(x, 0))
        if ((length(param$m) - 1) < min | (length(param$m) - 1) > max)
            return(NULL)
        while (any(param$lambda < thr)) {
            pos <- which(param$lambda < thr)
            param <- lapply(param, function(x, pos) x[-pos], pos = pos)
            suppressMessages(invisible(capture.output(fit <- normalmixEM(x, lambda = param$lambda,
                mu = param$m, sd = param$sigma, epsilon = 1e-50))))
            ye <- vapply(seq_len(length(fit$lambda)), function(i, x, fit) {
                dnorm(x, fit$mu[i], fit$sigma[i]) * fit$lambda[i]
            }, numeric(length(den$x)), x = den$x, fit = fit)
            if (is.null(dim(ye)))
                ye <- matrix(ye, length(ye), 1)
            mse <- mean((rowSums(ye) - den$y)^2)
            param <- list(mu = fit$mu, sigma = fit$sigma, lambda = fit$lambda)
        }
        list(mu = fit$mu, sigma = fit$sigma, lambda = fit$lambda,
            loglik = fit$loglik, mse = mse)
    }, x = x, thr = thr, den = density(x, n = 512 * 20), min = min, max = max)
    fit <- fit[vapply(fit, length, numeric(1)) > 0]
    if (length(fit) == 0)
        stop("No sucessful filt with provided parameters", call. = FALSE)
    mse <- vapply(fit, function(x) x$mse, numeric(1))
    mse[!is.finite(mse)] <- 1e+06
    res <- fit[[which.min(mse)]]
    class(res) <- "mgfit"
    return(res)
}

# Predict relative likelihood for mixGaussianFit This function computes the
# relative likelihood based on parameters fitted with mixGaussianFit function
# @param fit Object generated by mixGaussianFit function
# @param x Numerical vector of observations
# @param k Optional vector of integers indicating the distributions to use
# @return Matrix of relative likelihood, observations in rows and distributions
# in columns
predict.mgfit <- function(fit, x, k = NULL) {
    # fit should have at least 2 gaussians
    if (length(fit$mu) < 2)
        stop("Mixture of at least 2 gaussians is required", call. = FALSE)
    # If k is not specified then fit for all gaussians
    if (is.null(k))
        k <- seq_len(length(fit$mu))
    # Ensure 1 <= k <= gaussians
    k <- k[k %in% seq_len(length(fit$mu))]
    # Check there are still usable k
    if (length(k) == 0)
        stop("Selected gaussians not present in the fit object", call. = FALSE)
    prob_density <- vapply(seq_len(length(fit$mu)), function(i, x, fit)
        dnorm(x, fit$mu[i], fit$sigma[i]), numeric(length(x)), x = x, fit = fit)
    prob_density <- prob_density/rowSums(prob_density)
    colnames(prob_density) <- seq_len(ncol(prob_density))
    return(prob_density[, k, drop = FALSE])
}

# Plot mixGaissuinFit objects Plot a mixture of distributions fitted with
# mixGaussianFit
# @param fit Object generated by the function mixGaussianFit
# @param x Vector of values to plot the distribution
# @param col Color for the background distribution
# @param lwd Line weight for the fitted distributions
# @param fitCol Optional vector of colors for the fitted distributions
# @param main Character string for the main title
# @param ... Additional parameters to pass to the plot function
# @return Nothing, a plot is generated
plot.mgfit <- function(fit, x, col = "grey75", lwd = 2, fitCol = NULL,
    main = "", ylim = NULL, ...) {
    if (is.null(fitCol))
        fitCol <- rainbow(length(fit$mu), 0.8, 0.8, end = 0.8)
    den <- density(x, from = min(x, na.rm = TRUE), to = max(x, na.rm = TRUE),
        na.rm = TRUE)
    denmax <- 0
    for (i in seq_len(length(fit$mu))) denmax <- max(denmax, dnorm(den$x,
        fit$mu[i], fit$sigma[i]) * fit$lambda[i])
    denmax <- max(denmax, den$y)
    if (is.null(ylim))
        ylim <- c(0, denmax)
    plot(den, type = "n", axes = FALSE, main = main, ylim = ylim, ...)
    axis(1)
    axis(2)
    polygon(c(min(den$x), den$x, max(den$x)), c(0, den$y, 0), col = col,
        border = NA)
    for (i in seq_len(length(fit$mu))) lines(den$x, dnorm(den$x, fit$mu[i],
        fit$sigma[i]) * fit$lambda[i], lwd = lwd, col = fitCol[i])
}
reef103/genomicInstability documentation built on Oct. 10, 2022, 12:33 a.m.