R/wdists.R

##=============================================================================
##
## Copyright (c) 2018 Paul McKeigue
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##=============================================================================

#' Gaussian kernel for density estimation
#'
#' @param x Point at which to evaluate the smoothed density.
#' @param X Vector of observed values.
#' @param h Kernel bandwidth.
#' @param n Length of vector \var{X}.
#'
#' @keywords internal
fsmooth <- function(x, X, h, n) {
    ## kernel density estimate at x: average of Gaussian kernels centred on the
    ## observed values, scaled by the bandwidth
    return(sum(dnorm((x - X) / h)) / (n * h))
}

#' Kullback-Leibler divergence of p from q
#'
#' @param p Probability distribution.
#' @param q Probability distribution.
#' @return The KL divergence expressed in nats.
#'
#' @keywords internal
kl <- function(p, q) {
    kl <- p * log(p / q)
    kl[p==0] <- 0
    kl <- mean(kl)
    return(kl)
}

#' Convert from natural log units to bits
#'
#' @param x Value expressed in natural log units (nats).
#' @return Value expressed in bits.
#'
#' @keywords internal
tobits <- function(x) {
  return(log2(exp(1)) * x)
}

#' Reweight the case and control densities towards mathematical consistency
#'
#' Combines the two kernel density estimates into a weighted estimate of their
#' geometric mean density, from which control and case densities satisfying
#' \code{f.cases(x) = exp(x) * f.ctrls(x)} are recovered. The parameter
#' \code{theta} controls an exponential tilting of the weights around the
#' respective means.
#'
#' @keywords internal
reweight.densities <- function(theta, fhat.ctrls, fhat.cases,
                               n.ctrls, n.cases, xseq, wts) {
    mean.ctrls <- sum(fhat.ctrls * xseq) / sum(fhat.ctrls)
    mean.cases <- sum(fhat.cases * xseq) / sum(fhat.cases)
    weights.ctrls <- n.ctrls * exp(-theta[1] * (xseq - mean.ctrls)^2)
    weights.cases <- n.cases * exp(+theta[1] * (xseq - mean.cases)^2)
    weights.ctrls <- weights.ctrls / sum(weights.ctrls)
    weights.cases <- weights.cases / sum(weights.cases)

    ## weighted average of two estimates of the geometric mean density
    fhat.geomean <- wts[, 1] * fhat.ctrls * weights.ctrls * exp(+0.5 * xseq) +
                    wts[, 2] * fhat.cases * weights.cases * exp(-0.5 * xseq)

    ## recover case and control densities from the geometric mean
    f.ctrls <- fhat.geomean * exp(-0.5 * xseq)
    f.cases <- fhat.geomean * exp(+0.5 * xseq)
    return(data.frame(f.ctrls=f.ctrls, f.cases=f.cases))
}

#' Evaluate objective function
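#'
#' The objective is the absolute log ratio of the normalizing constants of the
#' reweighted control and case densities; at the optimum both reweighted
#' densities integrate to (approximately) the same constant.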
#' @keywords internal
error.integrals <- function(theta, densities, wts) {
    wdens <- with(densities, reweight.densities(theta, f.ctrls, f.cases,
                                                n.ctrls, n.cases, x, wts))
    ## objective function is abs(log(ratio of normalizing constants))
    obj <- abs(log(sum(wdens$f.ctrls) / sum(wdens$f.cases)))
    return(obj)
}

#' Summary evaluation of predictive performance
#'
#' @param studyname Name of the study.
#' @param y Binary outcome label (0 for controls, 1 for cases).
#' @param posterior.p Vector of posterior probabilities generated by a logistic
#'        regression model.
#' @param prior.p Vector of prior probabilities.
#' @return A dataframe listing some metrics of predictive performance.
#'
#' @importFrom pROC auc
#'
#' @examples
#' data("cleveland") # load example dataset
#' with(cleveland,
#'      wtrue.results("cleveland", y, posterior.p, prior.p))
#' 
#' @export
#' 
wtrue.results <- function(studyname, y, posterior.p, prior.p) {
    ## force the direction of computation of the C-statistic so that predicted
    ## values for controls are taken to be lower than or equal to those for cases
    auroc <- auc(y, posterior.p, direction="<")

    ## weight of evidence in favour of true status
    loglikrat <- (2 * y - 1) * weightsofevidence(posterior.p, prior.p)
    loglikrat.case <- loglikrat[y==1]
    loglikrat.ctrl <- loglikrat[y==0]
    mean.loglikrat <- mean(c(loglikrat.case, loglikrat.ctrl))

    ## test log-likelihood
    loglik <- y * log(posterior.p) + (1 - y) * log(1 - posterior.p)
    results <- data.frame(model=studyname,
                          casectrlrat=paste0(length(loglikrat.case), " / ",
                                             length(loglikrat.ctrl)),
                          test.loglik=round(tobits(sum(loglik)), 2),
                          auroc=round(auroc, 3),
                          loglikrat.all=round(tobits(mean.loglikrat), 2))
    names(results) <-
        c("Model", "Cases / controls",
          "Test log-likelihood (bits)",
          "Crude C-statistic",
          "Crude Lambda (bits)")
    return(results)
}

#' Calculate weights of evidence in natural log units
#'
#' @param posterior.p Vector of posterior probabilities generated by using the
#'        model to predict on test data.
#' @param prior.p Prior probabilities on test data.
#' @return The weight of evidence in nats for each observation.
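#'
#' @details The weight of evidence is the difference between the posterior and
#'          prior log odds of being a case, expressed in natural log units.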
#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#' densities.adj <- Wdensities.fromraw(densities.unadj)
#' plotWdists(densities.unadj, densities.adj)
#' 
#' @export
weightsofevidence <- function(posterior.p, prior.p) {
    W <- (log(posterior.p) - log(1 - posterior.p) -
          log(prior.p / (1 - prior.p)))
    return(W)
}

#' Calculate the unadjusted smoothed densities of W in cases and in controls
#'
#' @param y Binary outcome label (0 for controls, 1 for cases).
#' @param W Weight of evidence.
#' @param range.xseq Range of points where the curves should be sampled.
#' @param x.stepsize Distance between successive points of the grid.
#' @param adjust.bw Multiplicative adjustment to the Sheather-Jones bandwidth.
#' @return A list containing the grid of points \code{x}, the unadjusted smoothed
#'         densities of W in controls and in cases, the numbers of controls and
#'         cases, and the grid step size.
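#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#'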
#' @export
Wdensities.unadjusted <- function(y, W, range.xseq=c(-25, 25), x.stepsize=0.01,
                                  adjust.bw=1) {
    n.ctrls <- sum(y == 0)
    n.cases <- sum(y == 1)
    if (n.ctrls + n.cases != length(y))
        stop("y contains values different from 0 or 1")

    xseq <- seq(range.xseq[1], range.xseq[2], by=x.stepsize)
    fhat.cases.raw <- fhat.ctrls.raw <- numeric(length(xseq))
    W.ctrls <- W[y == 0]
    W.cases <- W[y == 1]
    bw.ctrls <- bw.SJ(W.ctrls) * adjust.bw
    bw.cases <- bw.SJ(W.cases) * adjust.bw
    for (i in seq_along(xseq)) {
        fhat.ctrls.raw[i] <- fsmooth(xseq[i], W.ctrls, h=bw.ctrls, n.ctrls)
        fhat.cases.raw[i] <- fsmooth(xseq[i], W.cases, h=bw.cases, n.cases)
    }
    return(list(x=xseq, f.ctrls=fhat.ctrls.raw, f.cases=fhat.cases.raw,
                n.ctrls=n.ctrls, n.cases=n.cases, x.stepsize=x.stepsize))
}

#' Adjust the crude densities of weights of evidence in cases and controls
#' to make them mathematically consistent
#'
#' @param densities Unadjusted densities computed by
#'        \code{\link{Wdensities.unadjusted}}.
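#' @return A list with elements \code{x}, \code{f.ctrls}, \code{f.cases},
#'         \code{n.ctrls}, \code{n.cases} and \code{x.stepsize}, containing the
#'         adjusted densities rescaled by their mean normalizing constant so that
#'         each integrates to approximately 1 over the grid.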
#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#' densities.adj <- Wdensities.fromraw(densities.unadj)
#' plotWdists(densities.unadj, densities.adj)
#' 
#' @export
Wdensities.fromraw <- function(densities) {
    xseq <- densities$x
    x.stepsize <- densities$x.stepsize
    n.ctrls <- densities$n.ctrls
    n.cases <- densities$n.cases
    wts <- cbind(exp(-0.5 * xseq) * n.ctrls,
                 exp(+0.5 * xseq) * n.cases)
    wts  <- wts / rowSums(wts) # normalize weights

    optim.result <- optim(par=0, fn=error.integrals, method="L-BFGS-B",
                          densities=densities, wts=wts,
                          lower=-0.5, upper=0.5)
    theta <- optim.result$par
    cat("Optimal value of theta:", theta, "\n")

    wdens <- reweight.densities(theta, densities$f.ctrls, densities$f.cases,
                                n.ctrls, n.cases,
                                xseq, wts)

    ## mean normalizing constant
    z <- 0.5 * (sum(wdens$f.ctrls) + sum(wdens$f.cases)) * x.stepsize
    f.cases <- wdens$f.cases / z
    f.ctrls <- wdens$f.ctrls / z
    cat("f.cases normalizes to", sum(f.cases * x.stepsize), "\n")
    cat("f.ctrls normalizes to", sum(f.ctrls * x.stepsize), "\n")
    return(list(x=xseq, f.ctrls=f.ctrls, f.cases=f.cases,
                n.ctrls=n.ctrls, n.cases=n.cases, x.stepsize=x.stepsize))
}

#' Compute smoothed densities for a spike-slab mixture distribution
#'
#' @param y Binary outcome label (0 for controls, 1 for cases).
#' @param W Weight of evidence.
#' @param in.spike Logical vector of the same length as \code{y}: TRUE if the
#'        observation belongs to the spike component, FALSE otherwise. Typically
#'        used when a high proportion of values of the predictor are zero.
#' @param range.xseq Range of points where the curves should be sampled.
#' @param x.stepsize Distance between each point.
#'
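#' @return A list with elements \code{x}, \code{f.ctrls}, \code{f.cases},
#'         \code{n.ctrls}, \code{n.cases} and \code{x.stepsize}, analogous to the
#'         value of \code{\link{Wdensities.unadjusted}}.
#'
#' @examples
#' \dontrun{
#' ## illustrative sketch only: in.spike is a hypothetical indicator of the
#' ## spike component (e.g. observations whose predictor value is zero)
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' in.spike <- abs(W - median(W)) < 0.5 # assumed spike indicator for illustration
#' densities.mix <- Wdensities.mix(cleveland$y, W, in.spike)
#' }
#'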
#' @export
Wdensities.mix <- function(y, W, in.spike, range.xseq=c(-25, 25), x.stepsize=0.01) {
    xseq <- seq(range.xseq[1], range.xseq[2], by=x.stepsize)
    n.ctrls <- sum(y == 0)
    n.cases <- sum(y == 1)
    if (n.ctrls + n.cases != length(y))
        stop("y contains values different from 0 or 1")

    Wdensity.mix.ctrls <- density.spike.slab(W[y==0], in.spike[y==0], xseq)
    Wdensity.mix.cases <- density.spike.slab(W[y==1], in.spike[y==1], xseq)
    return(list(x=xseq, f.ctrls=Wdensity.mix.ctrls$y, f.cases=Wdensity.mix.cases$y,
                n.ctrls=n.ctrls, n.cases=n.cases, x.stepsize=x.stepsize))
}

#' Kernel density estimate for a spike-and-slab mixture
#'
#' Estimates the density as a mixture of the kernel density estimates of the
#' spike and slab components, weighted by their sample proportions.
#'
#' @keywords internal
density.spike.slab <- function(W, in.spike, xseq) {
    density.spike <- density(W[in.spike], bw="SJ", n=length(xseq),
                             from=min(xseq), to=max(xseq))
    density.slab <- density(W[!in.spike], bw="SJ", n=length(xseq),
                            from=min(xseq), to=max(xseq))
    wts.mix <- as.integer(table(in.spike))
    wts.mix <- wts.mix / sum(wts.mix)
    density.mix <- data.frame(x=xseq,
                              y=wts.mix[1] * density.slab$y +
                                wts.mix[2] * density.spike$y)
    return(density.mix)
}

#' Compute area under the ROC curve according to model-based densities
#'
#' @param densities Adjusted densities computed by
#'        \code{\link{Wdensities.fromraw}}.
#' @return The area under the model-based ROC curve computed from the densities
#'         of the weight of evidence in cases and in controls. This model-based
#'         ROC curve is always concave, provided that the densities have been
#'         adjusted to make them mathematically consistent.
#'
#' @seealso \code{\link{plotroc}}
#' 
#' @importFrom zoo rollmean
#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#' densities.adj <- Wdensities.fromraw(densities.unadj)
#' auroc.model(densities.adj)
#' 
#' @export
auroc.model <- function(densities) {
    x.stepsize <- densities$x.stepsize
    cumfreqs.ctrls <- cumfreqs(densities$f.ctrls, densities$x, x.stepsize)
    cumfreqs.cases <- cumfreqs(densities$f.cases, densities$x, x.stepsize)
    roc.model <- data.frame(x=1 - cumfreqs.ctrls$F, y=1 - cumfreqs.cases$F)
    auroc.model <- -sum(diff(roc.model$x) * rollmean(roc.model$y, 2))
    return(auroc.model)
}

#' Compute the expected information for discrimination (expected weight of evidence)
#' from the model-based densities
#'
#' @param densities Adjusted densities computed by
#'        \code{\link{Wdensities.fromraw}}.
#' @return The model-based expected information for discrimination.
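#'
#' @details Lambda is computed as the average of the expected weights of evidence
#'          in favour of the correct assignment in controls and in cases, weighted
#'          by the numbers of controls and cases, and converted from nats to bits.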
#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#' densities.adj <- Wdensities.fromraw(densities.unadj)
#' lambda.model(densities.adj)
#' 
#' @export
lambda.model <- function(densities) {
    wts.ctrlscases <- with(densities, c(n.ctrls, n.cases) / (n.ctrls + n.cases))
    lambda <- sum(wts.ctrlscases * means.densities(densities)) / log(2)
    return(lambda)
}

#' Proportions of cases and controls below a given threshold of W (natural logs)
#'
#' @param densities Adjusted densities computed by
#'        \code{\link{Wdensities.fromraw}}.
#' @param w.threshold Threshold value of weight of evidence (natural logs).
#' @return numeric vector of length 2: proportions of controls and cases with weight
#' of evidence below the given threshold.
#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#' densities.adj <- Wdensities.fromraw(densities.unadj)
#' w.threshold <- log(4) # threshold Bayes factor of 4
#' prop.belowthreshold(densities.adj, w.threshold)
#' 
#' @export
prop.belowthreshold <- function(densities, w.threshold) {
    x.stepsize <- densities$x.stepsize
    xseq.threshold.idx <- which(densities$x >= w.threshold)[1]
    prop.ctrls <- round(cumfreqs(densities$f.ctrls, densities$x,
                                 x.stepsize)[xseq.threshold.idx, 2], 3)
    prop.cases <- round(cumfreqs(densities$f.cases, densities$x,
                                 x.stepsize)[xseq.threshold.idx, 2], 3)
    return(c(ctrls=prop.ctrls, cases=prop.cases))
}

#' Cumulative frequency distribution
#'
#' @keywords internal
cumfreqs <- function(f, xseq, x.stepsize) {
    ## normalize f
    f <- f / sum(f * x.stepsize)
    return(data.frame(x=xseq, F=cumsum(f * x.stepsize)))
}

#' Means of densities in cases and controls
#'
#' @param densities Adjusted densities computed by
#'        \code{\link{Wdensities.fromraw}}.
#' @return Numeric vector of length 2: the expected weight of evidence in favour
#'         of the correct status in controls (the negated mean of W in controls)
#'         and in cases (the mean of W in cases).
#'
#' @seealso \code{\link{lambda.model}}
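#'
#' @examples
#' data("cleveland") # load example dataset
#' W <- with(cleveland, weightsofevidence(posterior.p, prior.p))
#' densities.unadj <- Wdensities.unadjusted(cleveland$y, W)
#' densities.adj <- Wdensities.fromraw(densities.unadj)
#' means.densities(densities.adj)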
#' @export
means.densities <- function(densities) {
    means.ctrls <- sum(densities$x * densities$f.ctrls) / sum(densities$f.ctrls)
    means.cases <- sum(densities$x * densities$f.cases) / sum(densities$f.cases)
    return(c(ctrls=-means.ctrls, cases=means.cases))
}