## R/wdists.R

##=============================================================================
##
## Copyright (c) 2018-2019 Paul McKeigue
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##=============================================================================

#' Kullback-Leibler divergence of p from q
#'
#' @param p,q Probability distributions.
#'
#' @return
#' The KL divergence expressed in nats.
#'
#' @noRd
kl <- function(p, q) {
    kl <- p * log(p / q)
    kl[p == 0] <- 0
    kl <- mean(kl)
    return(kl)
}

#' Convert from natural log units to bits
#'
#' @param x Value expressed in natural log units (nats).
#'
#' @return
#' Value expressed in bits.
#'
#' @noRd
tobits <- function(x) {
  return(log2(exp(1)) * x)
}
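
## For reference: one nat is log2(e) (about 1.4427) bits, so tobits(log(2))
## returns exactly 1 and tobits(1) returns about 1.4427.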

#' Reweight the crude densities to enforce mathematical consistency
#'
#' Reweights the crude density estimates (the reweighting is controlled by
#' \code{theta}) and recombines them so that the returned control and case
#' densities satisfy f.ctrls(W) = exp(-W) * f.cases(W).
#'
#' @noRd
reweight.densities <- function(theta, fhat.ctrls, fhat.cases,
                               n.ctrls, n.cases, xseq, wts) {
    mean.ctrls <- sum(fhat.ctrls * xseq) / sum(fhat.ctrls)
    mean.cases <- sum(fhat.cases * xseq) / sum(fhat.cases)
    weights.ctrls <- n.ctrls * exp(-theta[1] * (xseq - mean.ctrls)^2)
    weights.cases <- n.cases * exp(+theta[1] * (xseq - mean.cases)^2)
    weights.ctrls <- weights.ctrls / sum(weights.ctrls)
    weights.cases <- weights.cases / sum(weights.cases)

    ## weighted average of the two estimates of the geometric mean density
    ## (one derived from controls, one from cases)
    fhat.geomean <- wts[, 1] * fhat.ctrls * weights.ctrls * exp(+0.5 * xseq) +
                    wts[, 2] * fhat.cases * weights.cases * exp(-0.5 * xseq)

    ## compute case and control densities, which by construction satisfy
    ## f.ctrls = exp(-W) * f.cases
    f.ctrls <- fhat.geomean * exp(-0.5 * xseq)
    f.cases <- fhat.geomean * exp(+0.5 * xseq)
    return(data.frame(f.ctrls=f.ctrls, f.cases=f.cases))
}

#' Evaluate objective function
#' @noRd
error.integrals <- function(theta, densities, wts) {
    wdens <- with(densities, reweight.densities(theta, f.ctrls, f.cases,
                                                n.ctrls, n.cases, x, wts))
    ## objective function is abs(log(ratio of normalizing constants))
    obj <- abs(log(sum(wdens$f.ctrls / sum(wdens$f.cases))))
    return(obj)
}

#' Calculate weights of evidence in natural log units
#'
#' @param posterior.p Vector of posterior probabilities generated by using model
#'        to predict on test data.
#' @param prior.p Vector of prior probabilities.
#'
#' @return
#' The weight of evidence in nats for each observation.
#'
#' @examples
#' data(cleveland) # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
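#'
#' # Worked single-observation example: posterior odds of 4:1 against prior
#' # odds of 1:1 give a weight of evidence of log(4), about 1.39 nats
#' weightsofevidence(posterior.p=0.8, prior.p=0.5)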
#'
#' @export
weightsofevidence <- function(posterior.p, prior.p) {
    validate.probabilities(posterior.p, prior.p)
    W <- (log(posterior.p) - log(1 - posterior.p) -
          log(prior.p / (1 - prior.p)))
    return(W)
}

#' Compute densities of weights of evidence in cases and controls
#'
#' The function computes smoothed densities of the weight of evidence in
#' cases and in controls from the crude probabilities, then adjusts them to
#' make them mathematically consistent so that p(W_ctrl) = exp(-W) p(W_case).
#'
#' If the sample distributions in cases and controls support a 2-component
#' mixture model (based on model comparison with BIC) for the densities, this
#' will be detected and a 2-component mixture model will be fitted before
#' adjustment.
#'
#' @param y Binary outcome label (0 for controls, 1 for cases).
#' @param posterior.p Vector of posterior probabilities generated by using model
#'        to predict on test data.
#' @param prior.p Vector of prior probabilities.
#' @param range.xseq Range of points where the curves should be sampled.
#' @param x.stepsize Distance between consecutive points of the grid.
#' @param adjust.bw Bandwidth adjustment for the Gaussian kernel density
#'        estimator. By default it is set to 1 (no adjustment), setting it to
#'        a value smaller/larger than 1 reduces/increases the smoothing of
#'        the kernel. This argument is ignored if more than one mixture component
#'        is identified.
#' @param recalibrate If \code{TRUE} (the default) the weights of evidence are
#'        calculated after the posterior probabilities have been recalibrated
#'        against \code{y} using a logistic regression model.
#' @param debug If \code{TRUE}, the size of the adjustment is reported.
#'
#' @return
#' A densities object that contains the information necessary to compute
#' summary measures and generate plots.
#'
#' @examples
#' data(cleveland)
#' densities <- with(cleveland, Wdensities(y, posterior.p, prior.p))
#'
#' # Example which requires fitting a mixture distribution
#' data(fitonly)
#' densities <- with(fitonly, Wdensities(y, posterior.p, prior.p))
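#'
#' # Illustrative check of the consistency condition: after adjustment the
#' # densities satisfy f.ctrls = exp(-W) * f.cases at every grid point
#' with(densities, max(abs(f.ctrls - exp(-x) * f.cases)))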
#'
#' @importFrom mclust mclustBIC Mclust
#' @importFrom stats optim
#' @export
Wdensities <- function(y, posterior.p, prior.p,
                       range.xseq=c(-25, 25), x.stepsize=0.01,
                       adjust.bw=1, recalibrate=TRUE, debug=FALSE) {
    n.ctrls <- sum(y == 0)
    n.cases <- sum(y == 1)
    if (n.ctrls + n.cases != length(y))
        stop("y contains values different from 0 or 1.")
    if (length(y) != length(posterior.p))
        stop("y and posterior.p must have the same number of observations.")
    if (x.stepsize <= 0)
        stop("x.stepsize must be positive.")
    if (any(posterior.p == 0 | posterior.p == 1))
        stop("Posterior probabilities exactly 0 or 1.")

    posterior.p.recalib <- recalibrate.p(y, posterior.p)
    if (recalibrate) {  # default is to use the recalibrated posterior
        posterior.p.used <- posterior.p.recalib
    } else {
        posterior.p.used <- posterior.p
    }

    W <- weightsofevidence(posterior.p.used, prior.p)
    xseq <- seq(range.xseq[1], range.xseq[2], by=x.stepsize)

    # determine number of mixture components by model comparison with BIC
    BIC.matrix.ctrls <- mclustBIC(data.frame(W[y==0]), G=1:2, verbose=FALSE)
    BIC.matrix.cases <- mclustBIC(data.frame(W[y==1]), G=1:2, verbose=FALSE)
    BIC.vector <- (BIC.matrix.ctrls + BIC.matrix.cases)[, 1]
    BIC.vector <- (BIC.vector - min(BIC.vector))
    if (debug) {
        cat("Controls:\n")
        print(BIC.matrix.ctrls)
        cat("\nCases:\n")
        print(BIC.matrix.cases)
        cat("\nBIC vector for controls + cases:\n")
        print(round(BIC.vector, 2))
    }
    num.components <- as.integer(which.max(BIC.vector))

    if (num.components > 1) {
        message("Density with ", num.components, " mixture components ",
                "chosen by BIC")
        mixmodel <- Mclust(W, G=num.components, verbose=FALSE)
        mixcomponent <- as.integer(mixmodel$classification)
        crude <- Wdensities.mix(y, W, xseq, mixcomponent)
    } else {
        crude <- Wdensities.crude(y, W, xseq, adjust.bw)
    }
    crude$x <- xseq
    crude$n.ctrls <- n.ctrls
    crude$n.cases <- n.cases

    wts <- cbind(exp(-0.5 * xseq) * n.ctrls,
                 exp(+0.5 * xseq) * n.cases)
    wts  <- wts / rowSums(wts) # normalize weights

    optim.result <- optim(par=0, fn=error.integrals, method="L-BFGS-B",
                          densities=crude, wts=wts,
                          lower=-0.5, upper=0.5)
    theta <- optim.result$par
    wdens <- reweight.densities(theta, crude$f.ctrls, crude$f.cases,
                                n.ctrls, n.cases,
                                xseq, wts)

    ## mean normalizing constant
    z <- 0.5 * (sum(wdens$f.ctrls) + sum(wdens$f.cases)) * x.stepsize
    f.cases <- wdens$f.cases / z
    f.ctrls <- wdens$f.ctrls / z
    if (debug) {
        cat("Optimal value of reweighting parameter theta:", theta, "\n")
        cat("f.cases normalizes to", sum(f.cases * x.stepsize), "\n")
        cat("f.ctrls normalizes to", sum(f.ctrls * x.stepsize), "\n")
    }

    ## cumulative frequencies for the adjusted distributions
    cumfreq.ctrls <- cumfreqs(f.ctrls, x.stepsize)
    cumfreq.cases <- cumfreqs(f.cases, x.stepsize)

    obj <- list(y=y, posterior.p=posterior.p, posterior.p.recalib=posterior.p.recalib,
                prior.p=prior.p,
                W=W, x=xseq, f.ctrls=f.ctrls, f.cases=f.cases,
                f.ctrls.crude=crude$f.ctrls, f.cases.crude=crude$f.cases,
                cumfreq.ctrls=cumfreq.ctrls, cumfreq.cases=cumfreq.cases,
                n.ctrls=n.ctrls, n.cases=n.cases, x.stepsize=x.stepsize)
    class(obj) <- "Wdensities"
    return(obj)
}

#' Calculate the crude smoothed densities of W in cases and in controls
#'
#' These functions compute the smoothed densities of the weight of
#' evidence in cases and in controls from the crude probabilities.
#' \code{Wdensities.crude} is designed for the general case; if the model
#' probabilities reflect a spike-slab mixture distribution, where a high
#' proportion of values of the predictor are zero, then \code{Wdensities.mix}
#' will be more appropriate.
#'
#' @param y Binary outcome label (0 for controls, 1 for cases).
#' @param W Weight of evidence.
#' @param xseq Sequence of points where the curves should be sampled.
#' @param adjust.bw Bandwidth adjustment.
#'
#' @return
#' List of crude densities for controls and cases sampled at each point in the
#' range provided.
#'
#' @importFrom stats bw.SJ density
#' @keywords internal
Wdensities.crude <- function(y, W, xseq, adjust.bw=1) {
    W.ctrls <- W[y == 0]
    W.cases <- W[y == 1]
    bw.ctrls <- bw.SJ(W.ctrls) * adjust.bw
    bw.cases <- bw.SJ(W.cases) * adjust.bw
    fhat.ctrls.raw <- density(W.ctrls, bw.ctrls, n=length(xseq),
                              from=min(xseq), to=max(xseq))$y
    fhat.cases.raw <- density(W.cases, bw.cases, n=length(xseq),
                              from=min(xseq), to=max(xseq))$y
    return(list(f.ctrls=fhat.ctrls.raw, f.cases=fhat.cases.raw))
}

#' @rdname Wdensities.crude
#' @param mixcomponent Integer vector of the same length as \code{y}, coded 1 or 2.
#'        Typically set by \code{Wdensities} which calls this function.
#'
#' @keywords internal
Wdensities.mix <- function(y, W, xseq, mixcomponent) {
    if (length(mixcomponent) != length(y))
        stop("Length of mixcomponent vector doesn't match length of y.")
    stopifnot(is.integer(mixcomponent))
    Wdensity.mix.ctrls <- density.mixture(W[y==0], mixcomponent[y==0], xseq)
    Wdensity.mix.cases <- density.mixture(W[y==1], mixcomponent[y==1], xseq)
    return(list(f.ctrls=Wdensity.mix.ctrls, f.cases=Wdensity.mix.cases))
}
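
## Illustrative call (a sketch, mirroring what Wdensities() does internally
## when BIC favours two components): given weights of evidence W, labels y
## and a grid xseq,
##   mixcomponent <- as.integer(Mclust(W, G=2, verbose=FALSE)$classification)
##   crude <- Wdensities.mix(y, W, xseq, mixcomponent)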

#' Fit a 2-component mixture density over range of xseq
#' @importFrom stats density
#' @noRd
density.mixture <- function(W, mixcomponent, xseq) {
    num.xseq <- length(xseq)
    min.xseq <- min(xseq)
    max.xseq <- max(xseq)

    ## special case in which there is only one component in this subset or
    ## the number of observations for a component is not enough to compute
    ## the bandwidth of the kernel density estimate
    if (length(unique(mixcomponent)) == 1 || any(table(mixcomponent) < 2))
        return(density(W, bw="SJ", n=num.xseq, from=min.xseq, to=max.xseq)$y)

    ## fit a density for each mixture component
    density.1 <- density(W[mixcomponent == 1], bw="SJ",
                         n=num.xseq, from=min.xseq, to=max.xseq)
    density.2 <- density(W[mixcomponent == 2], bw="SJ",
                         n=num.xseq, from=min.xseq, to=max.xseq)

    ## sum these densities weighted by the mixture proportions
    wts.mix <- as.integer(table(mixcomponent)) / length(mixcomponent)
    density.mix <- wts.mix[1] * density.1$y + wts.mix[2] * density.2$y
    return(density.mix)
}

#' Summary evaluation of predictive performance
#'
#' @param object,x,densities Densities object produced by
#'        \code{\link{Wdensities}}.
#' @param \dots Further arguments passed to or from other methods. These are
#'        currently ignored.
#'
#' @return
#' \code{summary} returns a data frame that reports the number of cases and
#' controls, the crude and model-based C-statistic, the crude and model-based
#' expected weight of evidence Lambda (in bits), and the test log-likelihood
#' before and after recalibration (in nats).
#'
#' @examples
#' data(cleveland)
#' densities <- with(cleveland, Wdensities(y, posterior.p, prior.p))
#'
#' summary(densities)
#' mean(densities)
#' auroc.model(densities)
#' lambda.model(densities)
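#'
#' # Crude (unadjusted) counterparts of the model-based summaries
#' auroc.crude(densities)
#' lambda.crude(densities)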
#'
#' @name summary-densities
#' @export
summary.Wdensities <- function(object, ...) {
    validate.densities(object)
    y <- object$y
    posterior.p <- object$posterior.p

    ## test log-likelihood
    loglik <- y * log(posterior.p) + (1 - y) * log(1 - posterior.p)

    posterior.p.recalib <- object$posterior.p.recalib
    loglik.recalib <-
            y * log(posterior.p.recalib) + (1 - y) * log(1 - posterior.p.recalib)

    results <- data.frame(casectrl=paste(object$n.cases, "/", object$n.ctrls),
                          auroc=round(auroc.crude(object), 3),
                          auroc.adj=round(auroc.model(object), 3),
                          lambda=round(lambda.crude(object), 2),
                          lambda.adj=round(lambda.model(object), 2),
                          test.loglik=round(sum(loglik), 2),
                          test.loglik.recalib=round(sum(loglik.recalib), 2)
                          )
    Lambda.char <- intToUtf8(923)
    names(results) <-
        c("Cases / controls",
          "Crude C-statistic",
          "Model-based C-statistic",
          paste("Crude", Lambda.char, "(bits)"),
          paste("Model-based", Lambda.char, "(bits)"),
          "Test log-likelihood (nats)",
          "log-likelihood after recalibration (nats)"
          )
    return(results)
}

#' @rdname summary-densities
#' @return
#' \code{mean} returns a numeric vector giving the mean weight of evidence
#' (in nats) in controls and in cases; the sign of the control mean is
#' flipped so that both elements are positive when the model is informative.
#'
#' @export
mean.Wdensities <- function(x, ...) {
    densities <- x
    validate.densities(densities)
    means.ctrls <- sum(densities$x * densities$f.ctrls) / sum(densities$f.ctrls)
    means.cases <- sum(densities$x * densities$f.cases) / sum(densities$f.cases)
    return(c(ctrls=-means.ctrls, cases=means.cases))
}

#' @rdname summary-densities
#' @importFrom pROC auc
#' @export
auroc.crude <- function(densities) {

    ## force direction of computation of the C-statistic so that predicted
    ## values for controls are lower or equal than values for cases
    auroc <- as.numeric(with(densities, auc(y, posterior.p, direction="<",
                                            quiet=TRUE)))
    return(auroc)
}

#' @rdname summary-densities
#' @return
#' \code{auroc.crude} and \code{auroc.model} return the area under the ROC curve
#' according to the crude and the model-based densities of weight of evidence,
#' respectively.
#'
#' @importFrom zoo rollmean
#' @export
auroc.model <- function(densities) {
    roc.model <- data.frame(x=1 - densities$cumfreq.ctrls,
                            y=1 - densities$cumfreq.cases)
    auroc.model <- -sum(diff(roc.model$x) * rollmean(roc.model$y, 2))
    return(auroc.model)
}

#' @rdname summary-densities
#' @export
lambda.crude <- function(densities) {
    W <- with(densities, weightsofevidence(posterior.p, prior.p))
    W.case <- W[densities$y == 1]
    W.ctrl <- W[densities$y == 0]
    lambda <- tobits(mean(c(mean(W.case), -mean(W.ctrl))))
    return(lambda)
}

#' @rdname summary-densities
#' @return
#' \code{lambda.crude} and \code{lambda.model} return the expected weight of
#' evidence (expected information for discrimination) in bits from the crude
#' and the model-based densities, respectively.
#'
#' @export
lambda.model <- function(densities) {
    wts.ctrlscases <- with(densities, c(n.ctrls, n.cases) / (n.ctrls + n.cases))
    lambda <- tobits(sum(wts.ctrlscases * mean(densities)))
    return(lambda)
}

#' Proportions of cases and controls below a threshold of weight of evidence
#'
#' @param densities Densities object produced by \code{\link{Wdensities}}.
#' @param w.threshold Threshold value of weight of evidence (natural logs).
#'
#' @return
#' Numeric vector of length 2 listing the proportions of controls and
#' cases with weight of evidence below the given threshold.
#'
#' @examples
#' data(cleveland)
#' densities <- with(cleveland, Wdensities(y, posterior.p, prior.p))
#' w.threshold <- log(4) # threshold Bayes factor of 4
#' prop.belowthreshold(densities, w.threshold)
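#'
#' # Proportions below a weight of evidence of zero (Bayes factor of 1)
#' prop.belowthreshold(densities, 0)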
#' 
#' @export
prop.belowthreshold <- function(densities, w.threshold) {
    validate.densities(densities)
    xseq.threshold.idx <- which(densities$x >= w.threshold)[1]
    prop.ctrls <- round(densities$cumfreq.ctrls[xseq.threshold.idx], 3)
    prop.cases <- round(densities$cumfreq.cases[xseq.threshold.idx], 3)
    return(c(ctrls=prop.ctrls, cases=prop.cases))
}

#' Cumulative frequency distribution
#'
#' Approximates the cumulative distribution function by a Riemann sum of the
#' normalized density over the grid.
#'
#' @noRd
cumfreqs <- function(f, x.stepsize) {
    ## normalize f
    f <- f / sum(f * x.stepsize)
    return(cumsum(f * x.stepsize))
}

#' Validate densities object
#'
#' Checks that the object has been created by \code{\link{Wdensities}}.
#'
#' @param obj An object to be checked.
#'
#' @return
#' Throws an error if the object is not a densities object.
#'
#' @noRd
validate.densities <- function(obj) {
    if (!inherits(obj, "Wdensities")) {
        stop("Not a densities object of class 'Wdensities'.")
    }
}

#' Validate probability vectors
#'
#' Checks that the input vectors are numeric, don't have any NAs, and contain
#' values that are probabilities.
#'
#' @param posterior,prior Vectors of probabilities.
#'
#' @return
#' Throws an error if the vectors do not contain probabilities.
#'
#' @noRd
validate.probabilities <- function(posterior, prior) {
    stopifnot(is.numeric(posterior), is.numeric(prior))
    if (length(prior) > 1 && length(prior) != length(posterior))
        stop("Wrong length for the vector of prior probabilities.")
    if (anyNA(posterior) || anyNA(prior))
        stop("NAs not allowed in the vectors of probabilities.")
    if (any(posterior < 0 | posterior > 1 | prior < 0 | prior > 1))
        stop("Vector does not contain probabilities.")
    if (any(posterior == 0 | posterior == 1))
        stop("Posterior probabilities exactly 0 or 1.")
}

#' Recalibrate posterior probabilities
#'
#' Transforms the posterior probabilities to logits, fits a logistic
#' regression of the outcome on these logits, and returns the fitted
#' probabilities from this model.
#'
#' @param y Binary outcome label (0 for controls, 1 for cases).
#' @param posterior.p Vector of posterior probabilities.
#' 
#' @return
#' Recalibrated posterior probabilities.
#' @importFrom stats glm predict
recalibrate.p <- function(y, posterior.p) {
    x <- log(posterior.p / (1 - posterior.p))
    glm.model <- glm(y ~ x, family="binomial")
    newp <- predict(object=glm.model, type="response")
    return(newp)
}
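
## Illustrative usage (a sketch, using the bundled cleveland dataset):
##   data(cleveland)
##   p.recalib <- with(cleveland, recalibrate.p(y, posterior.p))
## Wdensities() applies this recalibration before computing the weights of
## evidence when recalibrate=TRUE (the default).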
