ComputePI: Compute Prediction Intervals on Simulated versus Observed Data

Documented in binning censoring nopredcorrect observed predcorrect simulated stratify vpcstats

#' Perform a Visual Predictive Check (VPC) computation
#'
#' These functions work together to claculate the statistics that are plotted
#' in a VPC. They would typically be chained together using the "pipe" operator
#' (see Examples).
#'
#' @param o An object.
#' @param ... Additional arguments.
#' @examples
#'
#' \dontrun{
#' library(data.table)
#' library(magrittr)
#' library(vpc)
#'
#' exampleobs <- as.data.table(vpc::simple_data$obs)[MDV == 0]
#' examplesim <- as.data.table(vpc::simple_data$sim)[MDV == 0]
#' exampleobs$PRED <- examplesim[REP == 1, PRED]
#' exampleobs$SEX <- rep(c("F", "M"), len=nrow(exampleobs))
#' 
#' vpc <- observed(exampleobs, x=TIME, y=DV) %>%
#'     simulated(examplesim, y=DV) %>%
#'     stratify(~ ISM) %>%
#'     binning(TIME) %>%
#'     predcorrect(pred=PRED) %>%
#'     vpcstats()
#'
#' plot(vpc)
#'
#' # Example with 2-way stratification
#'
#' vpc <- vpc %>%
#'     stratify(SEX ~ ISM) %>%
#'     binning(TIME) %>%
#'     predcorrect(pred=PRED) %>%
#'     vpcstats()
#'
#' plot(vpc)
#' }
#'
#' @import data.table
#' @import magrittr
#' @importFrom stats median model.frame quantile setNames update
#' @name generics
NULL

#' @rdname generics
#' @export
observed <- function(o, ...) UseMethod("observed")

#' @rdname generics
#' @export
simulated <- function(o, ...) UseMethod("simulated")

#' @rdname generics
#' @export
censoring <- function(o, ...) UseMethod("censoring")

#' @rdname generics
#' @export
stratify <- function(o, ...) UseMethod("stratify")

#' @rdname generics
#' @export
binning <- function(o, ...) UseMethod("binning")

#' @rdname generics
#' @export
predcorrect <- function(o, ...) UseMethod("predcorrect")

#' @rdname generics
#' @export
nopredcorrect <- function(o, ...) UseMethod("nopredcorrect")

#' @rdname generics
#' @export
vpcstats <- function(o, ...) UseMethod("vpcstats")

#' @export
update.vpcstatsobj <- function(object, ...) {
    args <- list(...)
    for (i in names(args)) {
        object[[i]] <- args[[i, exact=TRUE]]
    }
    object
}

#' @export
observed.data.frame <- function(o, x, yobs, pred=NULL, blq, lloq=-Inf, ...) {
    data <- o
    x    <- rlang::eval_tidy(rlang::enquo(x),    data)
    yobs <- rlang::eval_tidy(rlang::enquo(yobs), data)
    pred <- rlang::eval_tidy(rlang::enquo(pred), data)
    lloq <- rlang::eval_tidy(rlang::enquo(lloq), data)
    lloq <- as.numeric(lloq)

    if (missing(blq)) {
        blq <- (yobs < lloq)
    } else {
        blq  <- rlang::eval_tidy(rlang::enquo(blq),  data)
    }
    blq  <- as.logical(blq)

    obs <- data.table(x, y=yobs, blq, lloq)

    o <- structure(list(data=data), class="vpcstatsobj")
    update(o, obs=obs, pred=pred)
}

#' @export
simulated.vpcstatsobj <- function(o, data, ysim, ...) {
    ysim <- rlang::eval_tidy(rlang::enquo(ysim), data)

    obs  <- o$obs
    x    <- obs$x
    nrep <- length(ysim)/nrow(obs)
    repl <- rep(1:nrep, each=nrow(obs))

    sim <- data.table(x, y=ysim, repl)
    update(o, sim=sim)
}

#' @export
censoring.vpcstatsobj <- function(o, blq, lloq, data=o$data, ...) {
    if (missing(blq)) {
        blq <- o$blq
    } else {
        blq <- rlang::eval_tidy(rlang::enquo(blq), data)
    }
    if (is.null(blq)) {
        stop("No blq specified")
    }

    if (missing(lloq)) {
        lloq <- o$lloq
    } else {
        lloq <- rlang::eval_tidy(rlang::enquo(lloq), data)
    }
    if (is.null(lloq)) {
        stop("No lloq specified")
    }

    .blq <- blq
    .lloq <- lloq
    o$obs[, blq := rep(.blq, len=.N)]
    o$obs[, lloq := rep(.lloq, len=.N)]

    update(o, censoring=TRUE)
}

#' @export
stratify.vpcstatsobj <- function(o, formula, data=o$data, ...) {
    if (!inherits(formula, "formula")) {
        stop("Expecting a formula")
    }
    flist <- as.list(formula)
    if (length(flist) == 3) {
        lhs <- as.call(c(flist[[1]], flist[[2]]))
        rhs <- as.call(c(flist[[1]], flist[[3]]))
        if (flist[[2]] == as.symbol(".")) {
            lhsmf <- NULL
        } else {
            lhsmf <- as.data.table(model.frame(lhs, data))
        }
        if (flist[[3]] == as.symbol(".")) {
            rhsmf <- NULL
        } else {
            rhsmf <- as.data.table(model.frame(rhs, data))
        }
        if (is.null(lhsmf) && is.null(rhsmf)) {
            stop("Invalid stratification formula: no variables specified")
        }
        strat <- cbind(lhsmf, rhsmf)
    } else {
        strat <- as.data.table(model.frame(formula, data))
    }

    reserved.names <- c("x", "y", "ypc", "pred", "blq", "lloq", "repl", "bin", "xbin", "qname", "lo", "md", "hi",
        "nobs", "xmedian", "xmean", "xmin", "xmax", "xmid", "xleft", "xright", "xcenter")
    if (any(names(strat) %in% reserved.names)) {
        stop(paste0("The names of used for stratification must not include: ",
                paste0(reserved.names, collapse=", ")))
    }
    o$obs[, names(strat) := strat]
    update(o, strat=strat, strat.formula=formula)
}

#' @export
binning.vpcstatsobj <- function(o, bin, data=o$data, ..., xbin="xmedian", centers, breaks, nbins, altx, stratum=NULL, by.strata=T) {

    keep <- i <- NULL
    . <- list

    # If xbin is numeric, then that is the bin
    xbin <- rlang::eval_tidy(rlang::enquo(xbin), data)
    if (is.numeric(xbin)) {
        if (length(xbin) != nrow(o$obs)) {
            stop("A numeric xbin be of length equal to the number of observations")
        }
        bin <- xbin
    } else {
        if (missing(bin) && !missing(centers)) {
            bin <- "centers"
        } else if (missing(bin) && !missing(breaks)) {
            bin <- "breaks"
        } else {
            bin <- rlang::eval_tidy(rlang::enquo(bin), data)
        }
    }

    # If you don't want to bin on the observed x, you can specify an alternate x for binning
    if (missing(altx)) {
        x <- o$obs$x
    } else {
        x <- rlang::eval_tidy(rlang::enquo(altx), data)
    }

    if (!missing(nbins)) {
        nbins <- rlang::eval_tidy(rlang::enquo(nbins), data)
        if (is.numeric(nbins) && !is.null(o$strat) && (length(nbins) == nrow(o$strat))) {
            nbins <- data.table(nbins)[, .(nbins=unique(nbins)), by=o$strat]
        }
    }

    args <- lapply(rlang::enquos(...), rlang::eval_tidy, data=data)

    by.strata <- isTRUE(by.strata)

    # Check if specific stratum selected (can be more than 1), setup filter
    if (!is.null(stratum)) {
        if (!is.list(stratum)) {
            stop("stratum must be a list, data.frame or data.table")
        }
        if (is.null(o$strat)) {
            stop("No stratification has been specified")
        }
        if (!by.strata) {
            stop("by.strata must be TRUE when stratum is specified")
        }
        filter <- copy(o$strat)[, keep := F]
        filter[as.data.table(stratum), keep := T, on=names(stratum)]
        filter <- filter$keep
    } else {
        filter <- rep(T, nrow(o$obs))  # Keep all
    }

    # If xbin is numeric, then that is the bin
    if (is.numeric(xbin)) {
        if (length(xbin) != nrow(o$obs)) {
            stop("A numeric xbin be of length equal to the number of observations")
        }
        bin <- xbin
    } else if (is.character(bin) && length(bin) == 1) {

        known.classInt.styles <- c("fixed", "sd", "equal", "pretty", "quantile",
            "kmeans", "hclust", "bclust", "fisher", "jenks", "dpih")

        if (bin == "centers") {
            if (missing(centers)) {
                stop("centers must be specified to use this binning method")
            }
            if (!by.strata && is.data.frame(centers)) {
                stop("by.strata must be TRUE when centers is a data.frame")
            }
            bin <- nearest(centers)
        } else if (bin == "breaks") {
            if (missing(breaks)) {
                stop("breaks must be specified to use this binning method")
            }
            if (!by.strata && is.data.frame(breaks)) {
                stop("by.strata must be TRUE when breaks is a data.frame")
            }
            bin <- cut_at(breaks)
        } else if (bin == "ntile") {
            if (missing(nbins)) {
                stop("nbins must be specified to use this binning method")
            }
            bin <- bin_by_ntile(nbins)
        } else if (bin == "eqcut") {
            if (missing(nbins)) {
                stop("nbins must be specified to use this binning method")
            }
            bin <- bin_by_eqcut(nbins)
        } else if (bin == "pam") {
            if (missing(nbins)) {
                stop("nbins must be specified to use this binning method")
            }
            bin <- bin_by_pam(nbins)
        } else if (bin %in% known.classInt.styles) {
            if (missing(nbins)) {
                nbins <- NULL
            }
            bin <- bin_by_classInt(bin, nbins)
        } else {
            stop(sprintf("Unknown binning method: %s", bin))
        }
    }

    if (is.function(bin)) {
        xdat <- data.table(i=1:nrow(o$obs), x=x)
        if (any(is.na(xdat[filter]$x))) {
            warning("x contains missing values, which could affect binning")
        }
        if (any(is.infinite(xdat[filter]$x))) {
            warning("x contains non-finite values, which could affect binning")
        }
        if (by.strata && !is.null(o$strat)) {
            sdat <- copy(o$strat)
            temp <- xdat[filter, .(i=i, j=do.call(bin, c(list(x), args, .BY))), by=sdat[filter]]
            j <- temp[order(i), j]
        } else {
            j <- xdat[filter, do.call(bin, c(list(x), args))]
        }
        if (length(j) != sum(filter)) {
            stop("The binning function did not return the right number of elements")
        }
    } else if (length(bin) == nrow(o$obs)) {
        j <- bin[filter]
    } else {
        stop("Incorrect binning specification")
    }
    o$obs[filter, bin := j]
    bin <- o$obs$bin

    if (!is.null(o$strat)) {
        stratbin <- data.table(o$strat, bin)
    } else {
        stratbin <- data.table(bin)
    }
    o <- update(o, .stratbin=stratbin, bin.by.strata=by.strata)

    # Assign an x value to each bin
    if (is.numeric(xbin)) {
        xbin <- data.table(xbin=xbin)[, .(xbin = unique(xbin)), by=stratbin]
    } else if (is.character(xbin) && length(xbin) == 1) {
        bi <- bininfo(o)
        xbin <- data.table(bi[, names(stratbin), with=F], xbin=bi[[xbin]])
    } else if (is.function(xbin)) {
        xbin <- data.table(x=x)[, .(xbin = xbin(x)), by=stratbin]
    } else {
        stop("Invalid xbin")
    }
    update(o, xbin=xbin)
}

#' @export
predcorrect.vpcstatsobj <- function(o, pred, data=o$data, ..., log=FALSE) {

    ypc <- y <- NULL

    if (missing(pred)) {
        pred <- o$pred
    } else {
        pred <- rlang::eval_tidy(rlang::enquo(pred), data)
    }
    if (is.null(pred)) {
        stop("No pred specified")
    }

    stratbin <- o$.stratbin
    if (is.null(stratbin)) {
        stop("Need to specify binning before pred correction")
    }

    mpred <- data.table(stratbin, pred)
    mpred <- mpred[, mpred := median(pred), by=stratbin]
    mpred <- mpred$mpred

    if (log) {
        o$obs[, ypc := (mpred - pred) + y]
        o$sim[, ypc := (mpred - pred) + y]
    } else {
        o$obs[, ypc := (mpred/pred)*y]
        o$sim[, ypc := (mpred/pred)*y]
    }

    update(o, predcor=TRUE, pred=pred)
}

#' @export
nopredcorrect.vpcstatsobj <- function(o, ...) {
    update(o, predcor=FALSE)
}

#' @export
vpcstats.vpcstatsobj <- function(o, qpred=c(0.05, 0.5, 0.95), ..., conf.level=0.95, quantile.type=7) {

    repl <- ypc <- blq <- y <- lloq <- NULL
    . <- list

    obs      <- o$obs
    sim      <- o$sim
    predcor  <- o$predcor
    stratbin <- o$.stratbin
    xbin     <- o$xbin

    if (is.null(stratbin)) {
        stop("Need to specify binning before calling vpcstats")
    }
    if (any(is.na(stratbin$bin))) {
        warning("There are bins missing. Has binning been specified for all strata?", call.=F)
    }

    .stratbinrepl <- data.table(stratbin, sim[, .(repl)])

    myquant1 <- function(y, probs, qname=paste0("q", probs), type=quantile.type, blq=F) {
        y <- y + ifelse(blq, -Inf, 0)
        y <- quantile(y, probs=probs, type=type, names=F, na.rm=T)
        y[y == -Inf] <- NA
        data.frame(qname, y)
    }

    myquant2 <- function(y, probs, qname=paste0("q", probs), type=quantile.type) {
        y <- quantile(y, probs=probs, type=type, names=F, na.rm=T)
        setNames(as.list(y), qname)
    }

    if (isTRUE(predcor)) {
        qobs <- obs[, myquant1(ypc, probs=qpred, blq=blq), by=stratbin]
        qsim <- sim[, myquant1(ypc, probs=qpred, blq=F),   by=.stratbinrepl]
    } else {
        qobs <- obs[, myquant1(y, probs=qpred, blq=blq), by=stratbin]
        qsim <- sim[, myquant1(y, probs=qpred, blq=F),   by=.stratbinrepl]
    }

    .stratbinquant <- qsim[, !c("repl", "y")]
    qconf <- c(0, 0.5, 1) + c(1, 0, -1)*(1 - conf.level)/2
    qqsim <- qsim[, myquant2(y, probs=qconf, qname=c("lo", "md", "hi")), by=.stratbinquant]
    stats <- qobs[qqsim, on=names(.stratbinquant)]
    stats <- xbin[stats, on=names(stratbin)]
    setkeyv(stats, c(names(o$strat), "xbin"))

    if (!is.null(obs$blq) && any(obs$blq)) {
        sim[, lloq := rep(obs$lloq, len=.N)]
        sim[, blq := (y < lloq)]
        pctblqobs <- obs[, .(y=100*mean(blq)), by=stratbin]
        pctblqsim <- sim[, .(y=100*mean(blq)), by=.stratbinrepl]
        .stratbinpctblq <- pctblqsim[, !c("repl", "y")]
        qpctblqsim <- pctblqsim[, myquant2(y, probs=qconf, qname=c("lo", "md", "hi")), by=.stratbinpctblq]
        pctblq <- pctblqobs[qpctblqsim, on=names(.stratbinpctblq)]
        pctblq <- xbin[pctblq, on=names(stratbin)]
        setkeyv(pctblq, c(names(o$strat), "xbin"))
    } else {
        pctblq <- NULL
    }

    update(o, stats=stats, pctblq=pctblq, conf.level=conf.level)
}

#' Obtain information about the bins from a VPC object.
#' @param o An object.
#' @param ... Additional arguments.
#' @return A `data.table` containing the following columns:
#' \itemize{
#'   \item \code{nobs}: the number of observed data points in the bin
#'   \item \code{xmedian}: the median x-value of the observed data points in the bin
#'   \item \code{xmean}: the mean x-value of the observed data points in the bin
#'   \item \code{xmax}: the maximum x-value of the observed data points in the bin
#'   \item \code{xmin}: the minimum x-value of the observed data points in the bin
#'   \item \code{xmid}: the value halfway between `xmin` and `xmax`.
#'   x-value of the observed data points in the bin
#'   \item \code{xleft}: the value halfway between the minimum x-value of the
#'   current bin and the maximum x-value of the previous bin to the left (for
#'   the left-most bin it is the minimum x-value).
#'   \item \code{xright}: the value halfway between the maximum x-value of the
#'   current bin and the minimum x-value of the next bin to the right (for the
#'   right-most bin it is the maximum x-value).
#'   \item \code{xcenter}: the value halfway between `xleft` and `xright`.
#' }
#' In addition, if statification was performed, the stratification columns will
#' be included as well.
#' @export
bininfo <- function(o, ...) UseMethod("bininfo")

#' @describeIn bininfo Method for \code{vpcstatsobj}.
#' @param by.strata Should the calculations be done by strata? Defaults to what
#' was specified when the binning was done.
#' @export
bininfo.vpcstatsobj <- function(o, by.strata=o$bin.by.strata, ...) {

    x <- xmin <- xmax <- bin <- NULL

    f1 <- function(x) {
        nobs    <- sum(!is.na(x))
        xmedian <- median(x, na.rm=T)
        xmean   <- mean(x, na.rm=T)
        xmin    <- min(x, na.rm=T)
        xmax    <- max(x, na.rm=T)
        xmid    <- 0.5*(xmin + xmax)
        data.table(nobs, xmedian, xmean, xmin, xmax, xmid)
    }

    # Compute xleft and xright
    f2 <- function(xmin, xmax) {
        xmin    <- c(xmin, xmax[length(xmax)])
        xmax    <- c(xmin[1], xmax)
        breaks  <- 0.5*(xmin + xmax)
        xleft   <- breaks[-length(breaks)]
        xright  <- breaks[-1]
        xcenter <- 0.5*(xleft + xright)
        data.table(xleft, xright, xcenter)
    }
    if (by.strata && !is.null(o$strat)) {
        bi <- o$obs[, f1(x), by=o$.stratbin]
        setkeyv(bi, c(names(o$strat), "xmin"))
        bi[, c(.SD, f2(xmin, xmax)), by=names(o$strat)]
    } else {
        bi <- o$obs[, f1(x), by=bin]
        setkeyv(bi, "xmin")
        bi <- cbind(bi, bi[, f2(xmin, xmax)])
        bi <- bi[unique(o$.stratbin), on="bin"]
        setkeyv(bi, "xmin")
        bi[, c(names(o$.stratbin), setdiff(names(bi), names(o$.stratbin))), with=F]
    }
}

#' Print a \code{vpcstatsobj}.
#' @param x An object.
#' @param ... Further arguments can be specified but are ignored.
#' @return Returns \code{x} invisibly.
#' @export
print.vpcstatsobj <- function(x, ...) {
    if (!is.null(x$sim)) {
        nrep <- nrow(x$sim)/nrow(x$obs)
        if (isTRUE(x$predcor)) {
            cat("Prediction corrected ")
        }
        cat(sprintf("VPC with %d replicates", nrep), "\n")
    }
    cat(sprintf("Stratified by: %s", paste0(names(x$strat), collapse=", ")), "\n")
    if (!is.null(x$stats)) {
        print(x$stats)
    }
    invisible(x)
}

#' Plot a \code{vpcstatsobj}.
#' @param x An object.
#' @param show.points Should the observed data points be plotted?
#' @param show.boundaries Should the bin boundary be displayed?
#' @param show.stats Should the VPC stats be displayed?
#' @param show.binning Should the binning be displayed by coloring the observed data points by bin?
#' @param xlab A character label for the x-axis.
#' @param ylab A character label for the y-axis.
#' @param color A character vector of colors for the percentiles, from low to high.
#' @param linetype A character vector of linetyps for the percentiles, from low to high.
#' @param legend.position A character string specifying the position of the legend.
#' @param facet.scales A character string specifying the `scales` argument to use for facetting.
#' @param ... Further arguments can be specified but are ignored.
#' @return A `ggplot` object.
#' @seealso
#' \code{ggplot}
#' @export
plot.vpcstatsobj <- function(x, ..., show.points=TRUE, show.boundaries=TRUE, show.stats=!is.null(x$stats), show.binning=isFALSE(show.stats), xlab=NULL, ylab=NULL, color=c("red", "blue", "red"), linetype=c("dotted", "solid", "dashed"), legend.position="top", facet.scales="free") {

    xbin <- lo <- hi <- qname <- md <- y <- xleft <- xright <- ypc <- NULL
    . <- list

    vpc <- x

    qlvls <- levels(vpc$stats$qname)
    qlbls <- paste0(100*as.numeric(sub("^q", "", qlvls)), "%")

    if (isTRUE(vpc$predcor)) {
        ylab <- paste0(ylab, "\nPrediction Corrected")
    }

    has_ggplot2 <- requireNamespace("ggplot2", quietly=TRUE)
    if (!has_ggplot2) {
        stop("Package 'ggplot2' is required for plotting. Please install it to use this method.")
    }
    if (show.stats) {
        g <- ggplot2::ggplot(vpc$stats, ggplot2::aes(x=xbin)) +
            ggplot2::geom_ribbon(ggplot2::aes(ymin=lo, ymax=hi, fill=qname, col=qname, group=qname), alpha=0.1, col=NA) +
            ggplot2::geom_line(ggplot2::aes(y=md, col=qname, group=qname)) +
            ggplot2::geom_line(ggplot2::aes(y=y, linetype=qname), size=1) +
            ggplot2::scale_colour_manual(
                name=sprintf("Simulated Percentiles\nMedian (lines) %s%% CI (areas)", 100*vpc$conf.level),
                values=color,
                breaks=qlvls,
                labels=qlbls) +
            ggplot2::scale_fill_manual(
                name=sprintf("Simulated Percentiles\nMedian (lines) %s%% CI (areas)", 100*vpc$conf.level),
                values=color,
                breaks=qlvls,
                labels=qlbls) +
            ggplot2::scale_linetype_manual(
                name="Observed Percentiles\n(black lines)",
                values=linetype,
                breaks=qlvls,
                labels=qlbls) +
            ggplot2::guides(
                fill=ggplot2::guide_legend(order=2),
                colour=ggplot2::guide_legend(order=2),
                linetype=ggplot2::guide_legend(order=1))
    } else {
        g <- ggplot2::ggplot(vpc$strat)
    }

    g <- g + ggplot2::theme_bw() +
        ggplot2::theme(
            legend.key.width=ggplot2::unit(2, "lines"),
            legend.position=legend.position) +
        ggplot2::labs(x=xlab, y=ylab)

    if (show.points) {
        points.dat <- copy(vpc$obs)
        if (isTRUE(vpc$predcor)) {
            points.dat[, y := ypc]
        }
        if (show.binning) {
            reorder2 <- function(y, x) {
                y <- stats::reorder(y, x)
                (1:nlevels(y))[y]
            }
            points.dat[, color := reorder2(factor(bin), x), by=vpc$strat]
            points.dat[, color := factor(color)]
            g <- g + ggplot2::geom_point(data=points.dat, ggplot2::aes(x=x, y=y, color=color), size=1, alpha=0.4, show.legend=F) +
                scale_color_brewer(palette="Set1")
        } else {
            g <- g + ggplot2::geom_point(data=points.dat, ggplot2::aes(x=x, y=y), size=1, alpha=0.4)
        }
    }

    if (show.boundaries) {
        if (!is.null(vpc$strat)) {
            boundaries <- bininfo(vpc)[, .(x=sort(unique(c(xleft, xright)))), by=names(vpc$strat)]
        } else {
            boundaries <- bininfo(vpc)[, .(x=sort(unique(c(xleft, xright))))]
        }
        if (show.binning) {
            g <- g + ggplot2::geom_vline(data=boundaries, ggplot2::aes(xintercept=x), size=rel(0.5), col="gray80") + 
                ggplot2::theme(panel.grid=ggplot2::element_blank())
        }
        g <- g + ggplot2::geom_rug(data=boundaries, ggplot2::aes(x=x), sides="t", size=1)
    }

    if (!is.null(vpc$strat)) {
        if (length(as.list(vpc$strat.formula)) == 3) {
            g <- g + ggplot2::facet_grid(vpc$strat.formula, scales=facet.scales)
        } else {
            g <- g + ggplot2::facet_wrap(names(vpc$strat), scales=facet.scales)
        }
    }

    g
}

# Internal function
.check_centers <- function(centers) {
    if (is.data.frame(centers)) {
        centers <- as.data.table(centers)
        if (is.null(centers$centers)) {
            stop("centers data.frame must contain column centers")
        }
        if (any(is.na(centers$centers))) {
            stop("centers cannot contain missing values")
        }
        keycols <- setdiff(names(centers), "centers")
        setkeyv(centers, keycols)
    } else if (is.numeric(centers)) {
        if (any(is.na(centers))) {
            stop("centers cannot contain missing values")
        }
    } else {
        stop("centers must be a numeric vector or data.frame")
    }
    centers
}

# Internal function
.resolve_centers <- function(centers, ...) {
    if (is.data.table(centers)) {
        keycols <- key(centers)
        key <- as.data.table(list(...)[keycols])
        centers <- unique(centers[key]$centers)
    }
    if (is.null(centers) || !is.numeric(centers) || any(is.na(centers))) {
        stop("invalid centers")
    }
    centers
}

# Internal function
.check_breaks <- function(breaks) {
    if (is.data.frame(breaks)) {
        breaks <- as.data.table(breaks)
        if (is.null(breaks$breaks)) {
            stop("breaks data.frame must contain column breaks")
        }
        if (any(is.na(breaks$breaks))) {
            stop("breaks cannot contain missing values")
        }
        keycols <- setdiff(names(breaks), "breaks")
        setkeyv(breaks, keycols)
    } else if (is.numeric(breaks)) {
        if (any(is.na(breaks))) {
            stop("breaks cannot contain missing values")
        }
    } else {
        stop("breaks must be a numeric vector or data.frame")
    }
    breaks
}

# Internal function
.resolve_breaks <- function(breaks, ...) {
    if (is.data.table(breaks)) {
        keycols <- key(breaks)
        key <- as.data.table(list(...)[keycols])
        breaks <- breaks[key]$breaks
    }
    if (is.null(breaks) || !is.numeric(breaks) || any(is.na(breaks))) {
        stop("invalid breaks")
    }
    breaks
}

# Internal function
.check_nbins <- function(nbins) {
    if (is.data.frame(nbins)) {
        nbins <- as.data.table(nbins)
        if (is.null(nbins$nbins)) {
            stop("nbins data.frame must contain column nbins")
        }
        if (any(is.na(nbins$nbins))) {
            stop("nbins cannot contain missing values")
        }
        keycols <- setdiff(names(nbins), "nbins")
        setkeyv(nbins, keycols)
    } else if (is.numeric(nbins) && length(nbins) == 1) {
        if (any(is.na(nbins))) {
            stop("nbins cannot contain missing values")
        }
    } else {
        stop("nbins must be a numeric vector of length 1 or data.frame")
    }
    nbins
}

# Internal function
.resolve_nbins <- function(nbins, ...) {
    if (is.data.table(nbins)) {
        keycols <- key(nbins)
        key <- as.data.table(list(...)[keycols])
        nbins <- unique(nbins[key]$nbins)
    }
    if (is.null(nbins) || !(is.numeric(nbins) && length(nbins) == 1 && !is.na(nbins))) {
        stop("nbins must be uniquely determined")
    }
    nbins
}

#' Different functions that perform binning.
#'
#' @param breaks A numeric vector of values that designate cut points between bins.
#' @param centers A numeric vector of values that designate the center of each bin.
#' @param nbins The number of bins to split the data into.
#' @param style a binning style (see ?classInt::classIntervals for details).
#' @return Each of these functions returns a function of a single numeric
#' vector `x` that assigns each value of `x` to a bin.
#' @examples
#'
#' x <- c(rnorm(10, 1, 1), rnorm(10, 3, 2), rnorm(20, 5, 3))
#' centers <- c(1, 3, 5)
#' nearest(centers)(x)
#'
#' breaks <- c(2, 4)
#' cut_at(breaks)(x)
#'
#' bin_by_eqcut(nbins=4)(x)
#' bin_by_ntile(nbins=4)(x)
#'
#' \dontrun{
#' bin_by_pam(nbins=4)(x)
#' bin_by_classInt("pretty", nbins=4)(x)
#' }
#'
#' @name binningfunctions
NULL

#' @rdname binningfunctions
#' @export
cut_at <- function(breaks) {
    breaks <- .check_breaks(breaks)
    function(x, ..., right=F) {
        breaks <- .resolve_breaks(breaks, ...)
        breaks <- sort(unique(breaks))
        if (min(x) < min(breaks)) {
            breaks <- c(min(x), breaks)
        }
        if (max(x) > max(breaks)) {
            breaks <- c(breaks, max(x))
        }
        as.character(cut(x, breaks, include.lowest=T, right=right))
    }
}

#' @rdname binningfunctions
#' @export
nearest <- function(centers) {
    centers <- .check_centers(centers)
    function(x, ...) {
        centers <- .resolve_centers(centers, ...)
        centers <- sort(unique(centers))
        dist <- function(a, b) abs(a - b)
        d <- outer(x, centers, dist)
        m <- apply(d, 1, which.min)
        centers[m]
    }
}

#' @rdname binningfunctions
#' @export
bin_by_ntile <- function(nbins) {
    nbins <- .check_nbins(nbins)
    function(x, ...) {
        nbins <- .resolve_nbins(nbins, ...)

        # Mimic the function from dplyr
        len <- sum(!is.na(x))
        r <- rank(x, ties.method="first", na.last="keep")
        as.integer(floor(nbins*(r - 1)/len + 1))
    }
}

#' @rdname binningfunctions
#' @export
bin_by_eqcut <- function(nbins) {
    nbins <- .check_nbins(nbins)
    function(x, ..., quantile.type=7) {
        nbins <- .resolve_nbins(nbins, ...)

        # Mimic the function from table1
        breaks <- quantile(x, probs=seq.int(nbins - 1)/nbins, na.rm=T, type=quantile.type)
        cut_at(breaks)(x)
    }
}

#' @rdname binningfunctions
#' @export
bin_by_pam <- function(nbins) {
    has_cluster <- requireNamespace("cluster", quietly=TRUE)
    if (!has_cluster) {
        stop("Package 'cluster' is required to use the binning method. Please install it.")
    }
    nbins <- .check_nbins(nbins)
    function(x, ...) {
        nbins <- .resolve_nbins(nbins, ...)

        centers <- sort(cluster::pam(x, nbins)$medoids)
        nearest(centers)(x)
    }
}

#' @rdname binningfunctions
#' @export
bin_by_classInt <- function(style, nbins=NULL) {
    has_classInt <- requireNamespace("classInt", quietly=TRUE)
    if (!has_classInt) {
        stop("Package 'classInt' is required to use the binning method. Please install it.")
    }
    style <- style
    if (!is.null(nbins)) {
        nbins <- .check_nbins(nbins)
    }
    function(x, ...) {
        args <- list(var=x, style=style)
        if (!is.null(nbins)) {
            nbins <- .resolve_nbins(nbins, ...)
            args$n <- nbins
        }
        args <- c(args, list(...))
        if (style %in% c("kmeans", "hclust", "dpih")) {
            # These don't accept '...' arguments
            args1 <- args[intersect(names(args), methods::formalArgs(classInt::classIntervals))]
            args2 <- if (style == "kmeans") {
                args[intersect(names(args), methods::formalArgs(stats::kmeans))]
            } else if (style == "hclust") {
                args[intersect(names(args), methods::formalArgs(stats::hclust))]
            } else if (style == "dpih") {
                has_KernSmooth <- requireNamespace("KernSmooth", quietly=TRUE)
                if (!has_KernSmooth) {
                    stop("Package 'KernSmooth' is required to use the binning method. Please install it.")
                }
                args[intersect(names(args), methods::formalArgs(KernSmooth::dpih))]
            } else {
                list()
            }
            args <- c(args1, args2)
        }
        args <- args[!duplicated(args)]
        breaks <- do.call(classInt::classIntervals, args)$brks
        cut_at(breaks)(x)
    }
}

#' Perform a consistency check on observed and simulated data.
#' 
#' This function performs a simple consistency check on an observed and
#' simulated dataset to make sure they are consistent with respect to ordering
#' as required by the other functions used in the VPC calculation.
#'
#' The consistency check is performed by comparing a combination of unique
#' subject identifier (ID) and time. Both `data.frame`s must be given with
#' those in positions 1 and 2 repectively.
#'
#' @param obs,sim A `data.frame` with 2 columns (see Details).
#' @param tol A tolerance for comparing time values.
#' @return The number of replicates contained in `sim`.
#' @seealso \code{\link{observed}}, \code{\link{simulated}}.
#' @examples
#'
#' \dontrun{
#' library(vpc)
#'
#' exampleobs <- as.data.table(vpc::simple_data$obs)[MDV == 0]
#' examplesim <- as.data.table(vpc::simple_data$sim)[MDV == 0]
#'
#' check_order(exampleobs[, .(ID, TIME)], examplesim[, .(ID, TIME)])
#' }
#' @export
check_order <- function(obs, sim, tol=1e-5) {
    if (nrow(sim) %% nrow(obs) != 0) {
        stop("Rows in sim is not a multiple of rows in obs")
    }
    if (is.numeric(obs[[1]])) obs[[1]] <- as.numeric(obs[[1]])
    if (is.numeric(sim[[1]])) sim[[1]] <- as.numeric(sim[[1]])
    if (is.factor(obs[[1]])) obs[[1]] <- as.character(obs[[1]])
    if (is.factor(sim[[1]])) sim[[1]] <- as.character(sim[[1]])
    if (!identical(rep(obs[[1]], len=nrow(sim)), sim[[1]])) {
        stop("ID columns are not identical")
    } 
    if (!all(abs(rep(obs[[2]], len=nrow(sim)) - sim[[2]]) < tol)) {
        stop("Time columns are not equal")
    } 
    nrow(sim) / nrow(obs)
}
smouksassi/ComputePI documentation built on Nov. 10, 2019, 2:46 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
smouksassi/ComputePI
Compute Prediction Intervals on Simulated versus Observed Data

R/vpcstats2.R
In smouksassi/ComputePI: Compute Prediction Intervals on Simulated versus Observed Data

Defines functions observed simulated censoring stratify binning predcorrect nopredcorrect vpcstats update.vpcstatsobj observed.data.frame simulated.vpcstatsobj censoring.vpcstatsobj stratify.vpcstatsobj binning.vpcstatsobj

Documented in binning censoring nopredcorrect observed predcorrect simulated stratify vpcstats

R Package Documentation

Browse R Packages

We want your feedback!

smouksassi/ComputePI Compute Prediction Intervals on Simulated versus Observed Data

R/vpcstats2.R In smouksassi/ComputePI: Compute Prediction Intervals on Simulated versus Observed Data

Defines functions observed simulated censoring stratify binning predcorrect nopredcorrect vpcstats update.vpcstatsobj observed.data.frame simulated.vpcstatsobj censoring.vpcstatsobj stratify.vpcstatsobj binning.vpcstatsobj

Documented in binning censoring nopredcorrect observed predcorrect simulated stratify vpcstats

R Package Documentation

Browse R Packages

We want your feedback!

smouksassi/ComputePI
Compute Prediction Intervals on Simulated versus Observed Data

R/vpcstats2.R
In smouksassi/ComputePI: Compute Prediction Intervals on Simulated versus Observed Data