R/optimFPM.R

Defines functions optimFPM

Documented in optimFPM

#' Optimization of Floating Percentile Model Parameters
#' 
#' Calculate parameter inputs that optimize benchmark performance
#' 
#' @param  data data.frame containing, at a minimum, chemical concentrations as columns and a logical \code{Hit} column classifying toxicity
#' @param  paramList character vector of column names of chemical concentration variables in \code{data}
#' @param  FN_crit numeric vector over which to optimize false negative thresholds (default = \code{seq(0.1, 0.9, by = 0.05))}
#' @param  alpha.test numeric vector of type-I error rate values over which to optimize (default = \code{seq(0.05, 0.5, by = 0.05))}
#' @param  which numeric or character indicating which type of plot to generate (see Details; default = \code{c(1, 2)})
#' @param  simplify logical; whether to generate simplified output (default = \code{TRUE})
#' @param  plot logical; whether to generate a plot to visualize the opimization results
#' @param  colors values recognizible as colors to be passed to \code{colorRampPalette} (via \code{colorGradient}) to generate a palette for plotting (default = \code{heat.colors(10)})
#' @param  colsteps integer; number of discrete steps to interpolate colors in \code{colorGradient} (default = \code{100})
#' @param  ... additional argument passed to \code{FPM}, \code{chemSig}, \code{chemSigSelect}, and \code{colorGradient}
#' @details
#' \code{optimFPM} was designed to help optimize the predictive capacity of the benchmarks generated by \code{FPM}. The default input parameters to
#' \code{FPM} (i.e., \code{FN_crit = 0.2} and \code{alpha.test = 0.05}) are arbitrary, and optimization can help to objectively establish more accurate benchmarks.
#' Graphical output from \code{optimFPM} can also help users to understand the relationship(s) between benchmark accuracy/error, \code{FN_crit}, and \code{alpha.test}.
#' 
#' Default inputs for \code{FN_crit} and \code{alpha.test} were selected to represent a reasonable range of values to test. Testing over both ranges
#' will result in a two-way optimization, which can be computationally intensive. Alternatively, \code{optimFPM} can be run for one parameter at a time
#' by specifying a single value for \code{FN_crit} or \code{alpha.test}. Note that inputting single values for both \code{FN_crit} and \code{alpha.test} will generate unhelpful results. 
#' 
#' Several metrics are used for optimization: 
#' 1. Ratio of sensitivity/specificity ("sensSpecRatio"), calculated as the minimum of the two metrics divided by the maximum of the two. Therefore, this value will always be between 0 and 1, representing the balance between correct \code{Hit==TRUE} and \code{Hit==FALSE} predictions.
#' 2. Overall reliability ("OR") (i.e., probability of correctly predicting \code{Hit} values)
#' 3. Fowlkes-Mallows Index ("FM") - an average of metrics focusing on predicting \code{Hit==TRUE}
#' 4. Matthew's Correlation Coefficient ("MCC") - a measure of the correspondence between the data and predictions analogous to a Pearson's correlation coefficient (but for binary data)
#' 
#' Graphical output will differ depending on whether or not a single value is input for \code{FN_crit} or \code{alpha.test}. Providing a single value for one
#' of the two arguments will generate a line graph, whereas providing longer vectors (i.e., length > 1) of inputs for both arguments will generate dot matrix plots using \code{colors} to generate
#' a color palette and \code{colsteps} to define the granularity of the color gradient with the palette. The order of \code{colors} will be plotted
#' from more optimal to less optimal; for example, the default of \code{heat.colors(10)} will show optimal colors as red and less optimal colors as yellower.
#' By default, multiple plots will be generated, however the \code{which} argument can control which plots are generated. Inputs
#' to \code{which} are, by default, \code{c(1, 2, 3, 4)} for the metrics noted above, and flexible character inputs also can be used to a degree.
#' Black squares indicate the optimal argument inputs; these values are also printed to the console and can be assigned to an object.
#' 
#' @return data.frame of optimized \code{FN_crit} and/or \code{alpha.test} values
#' @seealso FPM, colorGradient, colorRampPalette
#' @importFrom dplyr bind_rows
#' @importFrom graphics points 
#' @importFrom graphics lines
#' @importFrom graphics abline
#' @importFrom graphics legend
#' @importFrom graphics polygon
#' @examples
#' paramList = c("Cd", "Cu", "Fe", "Mn", "Ni", "Pb", "Zn")
#' FN_seq <- seq(0.1, 0.3, 0.05)
#' alpha_seq <- seq(0.05, 0.2, 0.05)
#' optimFPM(h.tristate, paramList, FN_seq, 0.05)
#' optimFPM(h.tristate, paramList, 0.2, alpha_seq)
#' optimFPM(h.tristate, paramList, FN_seq, alpha_seq, which=2)
#' @export
optimFPM <- function(data, 
                      paramList,
                      FN_crit = seq(0.1, 0.9, by = 0.05),
                      alpha.test = seq(0.05, 0.5, by = 0.05),
                      which = c(1, 2, 3, 4),
                      simplify = TRUE,
                      plot = TRUE,
                      colors = heat.colors(10),
                      colsteps = 100,
                      ...){
    
    if(length(FN_crit) > 1 & length(alpha.test) > 1){
        grid <- expand.grid(FN_crit, alpha.test)
        names(grid) <- c("FN_crit", "alpha.test")

        x <- apply(as.matrix(grid), 1, function(x) {
            suppressWarnings(expr = {
                tmp <- FPM(data, paramList, FN_crit = x[[1]], alpha.test = x[[2]], ...)[["FPM"]]
            })
           return(tmp)
        })
        
        z <- as.data.frame(matrix(ncol = length(paramList) + 12,
                                         nrow = 1, data = rep(NA, length(paramList) + 12)))
        
        names(z) <- c(paramList, "FN_crit", "TP", "FN", "TN", "FP", "pFN", "pFP", 
                "sens", "spec", "OR", "FM", "MCC")
        z <- do.call(rbind, lapply(x, function(i) dplyr::bind_rows(z, i)[-1,])) # remove dummy row
        row.names(z) <- NULL
        
        y <- do.call(rbind, lapply(x, function(tmp) {
            c(sensSpec = min(tmp$sens, tmp$spec)/max(tmp$sens, tmp$spec), 
              OR = tmp$OR, FM = tmp$FM, MCC = tmp$MCC)
            }))
        out <- data.frame(grid, y)
        
        min1 <- out[min(which.min(1 - out$sensSpec)),]
        min2 <- out[min(which.min(1 - out$OR)),]
        min3 <- out[min(which.min(1 - out$FM)),]
        min4 <- out[min(which.min(1 - out$MCC)),]
        
        if(plot){
            
            if(any(c(1, "sens", "spec", "sensitivity", "Sensitivity", "senSpec", "SenSpec",
                     "specificity", "Specificity", "sensSpec", "SensSpec", "sensspec", "ss", "SS") %in% which)){
                plot(out[1:2], 
                     col = colorGradient(1 - out[, "sensSpec"], colors = colors, colsteps = colsteps, na.rm = TRUE),
                     pch = 15, xlab = "FN Limit", ylab = "Alpha", cex = 3, 
                     main = paste0("Sensitivity-Specificity Ratio\nRange:", 
                                   paste(signif(range(out[, "sensSpec"], na.rm = T), 2), collapse = "-")))
                points(x = min1$FN_crit, y = min1$alpha.test, pch = 0, cex = 4, col = "black")
            }
            
            if(any(c(2, "OR", "or", "reliability", "Reliability", "rel", "Rel") %in% which)){
                plot(out[1:2], 
                     col = colorGradient(1 - out[, "OR"], colors = colors, colsteps = colsteps, na.rm = TRUE),
                     pch = 15, xlab = "FN Limit", ylab = "Alpha", cex = 3, 
                     main = paste0("Overall Reliability\nRange:", 
                                   paste(signif(range(out[, "OR"], na.rm = T), 2), collapse = "-")))
                points(x = min2$FN_crit, y = min2$alpha.test, pch = 0, cex = 4, col = "black")
            }
            
            if(any(c(3, "fm", "FM", "Fowlkes", "Fowlks", "fowlkes", "mallows", "mallow", "Mallows", "Mallow",
                     "f-m", "F-M") %in% which)){
                plot(out[1:2], 
                     col = colorGradient(1 - out[, "FM"], colors = colors, colsteps = colsteps, na.rm = TRUE),
                     pch = 15, xlab = "FN Limit", ylab = "Alpha", cex = 3, 
                     main = paste0("Fowlkes-Mallows Index\nRange:", 
                                   paste(signif(range(out[, "FM"], na.rm = T), 2), collapse = "-")))
                points(x = min3$FN_crit, y = min3$alpha.test, pch = 0, cex = 4, col = "black")
            }
            
            if(any(c(4, "Matthews", "matthews", "MCC", "mcc", "Mcc") %in% which)){
                plot(out[1:2], 
                     col = colorGradient(1 - out[, "MCC"], colors = colors, colsteps = colsteps, na.rm = TRUE),
                     pch = 15, xlab = "FN Limit", ylab = "Alpha", cex = 3,
                     main = paste0("Matthew's Correlation Coefficient\nRange:", 
                                   paste(signif(range(out[, "MCC"], na.rm = T), 2), collapse = "-")))
                points(x = min4$FN_crit, y = min4$alpha.test, pch = 0, cex = 4, col = "black")
            }
        }
        tmp <- data.frame(t(min1[1:2]), t(min2[1:2]), t(min3[1:2]), t(min4[1:2]))
        names(tmp) <- c("sensSpec", "OR", "FM", "MCC")
        
        if(simplify){
            return(tmp)
        } else {
            return(list(z, tmp))
        }
    } else if(length(FN_crit) > 1 & length(alpha.test) == 1) {
        if(any(FN_crit >  1) | any(FN_crit <= 0)){
            stop("FN_crit values must be between 0 and 1")
        }
        
        tmp <- suppressWarnings(expr = {
            sapply(FN_crit, function(x) {
                FPM(data = data, paramList = paramList, FN_crit = x, alpha.test = alpha.test, ...)[["FPM"]]
            })
        })
        pFN <- unlist(tmp["pFN",])
        pFP <- unlist(tmp["pFP",])
        sens <- unlist(tmp["sens",])
        spec <- unlist(tmp["spec",])
        OR <- unlist(tmp["OR",])
        FM <- unlist(tmp["FM",])
        MCC <- unlist(tmp["MCC",])
 
        sensSpec <- numeric()
        for(i in 1:length(sens)){
            sensSpec[i] <- min(c(sens[i], spec[i]))/max(c(sens[i], spec[i]))
        }
        
        pick1 <- FN_crit[which.min(1 - sensSpec)]
        pick2 <- FN_crit[which.min(1 - OR)]
        pick3 <- FN_crit[which.min(1 - FM)]
        pick4 <- FN_crit[which.min(1 - MCC)]
        
        z <- as.data.frame(matrix(ncol = length(paramList) + 12,
                                         nrow = 1, data = rep(NA, length(paramList) + 12)))

        names(z) <- c(paramList, "FN_crit", "TP", "FN", "TN", "FP", "pFN", "pFP", 
                "sens", "spec", "OR", "FM", "MCC")
        out <- suppressWarnings(expr = {
                do.call(dplyr::bind_rows, 
                         list(z,
                         data.frame(FPM(data = data, paramList = paramList, FN_crit = pick1, ...)[["FPM"]]),
                         data.frame(FPM(data = data, paramList = paramList, FN_crit = pick2, ...)[["FPM"]]),
                         data.frame(FPM(data = data, paramList = paramList, FN_crit = pick3, ...)[["FPM"]]),
                         data.frame(FPM(data = data, paramList = paramList, FN_crit = pick4, ...)[["FPM"]])
                         )
            )[-1,]
        })
        row.names(out) <- c("sensSpec", "OR", "FM", "MCC")
        
        if(plot){
            plot(x = FN_crit, y = OR, ylim = c(0,1), 
                 xlab = "FN Limit", ylab = "Error rate / Metric Value", type = "n")
            
            lines(x = FN_crit, y = pFN, col = "darkred", type = "l", lwd = 2)
            lines(x = FN_crit, y = pFP, col = "darkblue", type = "l", lwd = 2)
            lines(x = FN_crit, y = FN_crit, col = "darkred", lty = 2)
            
            polygon(x = c(FN_crit, rev(FN_crit)), 
                    y = c(FN_crit, rev(pFN)), 
                    col = rgb(1,0,0,0.1), border = F)
            
            lines(x = FN_crit, y = sensSpec, col = "darkorchid")
            lines(x = FN_crit, y = OR, col = "darkorange")
            lines(x = FN_crit, y = FM, col = "gold3")
            lines(x = FN_crit, y = MCC, col = "forestgreen")
            
            legend("top", inset = -0.1, xpd = TRUE, cex = 0.8, ncol = 2, bg = "white",
                   lty = c(1, 1, 2, 1, 1, 1, 1), lwd = c(2, 2, 1, 1, 1, 1, 1),
                   col = c("darkred", "darkblue", "darkred", 
                           "darkorchid", "darkorange", "gold3", "forestgreen"),
                   legend = c("FN rate", "FP rate", "FN threshold", "sensSpec", "OR", "FM", "MCC"))
        }
        
        if(simplify){
            return(
            c(sensSpec = out["sensSpec", "FN_crit"],
              OR = out["OR", "FN_crit"],
              FM = out["FM", "FN_crit"],
              MCC = out["MCC", "FN_crit"])
            )
        } else{
            return(list(optim = out, 
                    optim_FN = c(sensSpec = out["sensSpec", "FN_crit"],
              OR = out["OR", "FN_crit"],
              FM = out["FM", "FN_crit"],
              MCC = out["MCC", "FN_crit"])
                )
            )
        }
    } else if(length(FN_crit) == 1 & length(alpha.test) > 1){
        if(any(alpha.test >  1) | any(alpha.test <= 0)){
            stop("alpha.test values must be between 0 and 1")
        }
        
        z <- as.data.frame(matrix(ncol = length(paramList) + 13,
                                         nrow = 1, data = rep(NA, length(paramList) + 13)))
 
        names(z) <- c(paramList, "FN_crit", "TP", "FN", "TN", "FP", "pFN", "pFP", 
                "sens", "spec", "OR", "FM", "MCC", "alpha.test")
         
        tmp <-    suppressWarnings(expr = {
            dplyr::bind_rows(z, 
                sapply(alpha.test, function(x) {
                    data.frame(FPM(data = data, paramList = paramList, FN_crit = FN_crit, alpha.test = x, ...)[["FPM"]], alpha.test = x)
                    }))[-1,]
        })
        
        pFN <- (tmp[,"pFN"])
        pFP <- (tmp[,"pFP"])
        sens <- (tmp[,"sens"])
        spec <- (tmp[,"spec"])
        OR <- (tmp[,"OR"])
        FM <- (tmp[,"FM"])
        MCC <- (tmp[,"MCC"])
        
        sensSpec <- numeric()
        for(i in 1:length(sens)){
            sensSpec[i] <- min(c(sens[i], spec[i]))/max(c(sens[i], spec[i]))
        }
        pick1 <- alpha.test[which.min(1 - sensSpec)]
        pick2 <- alpha.test[which.min(1 - OR)]
        pick3 <- alpha.test[which.min(1 - FM)]
        pick4 <- alpha.test[which.min(1 - MCC)]
        
        out <- suppressWarnings(expr = {
                do.call(dplyr::bind_rows, 
                    list(z,
                         data.frame(FPM(data = data, paramList = paramList, alpha.test = pick1, ...)[["FPM"]], alpha.test = pick1),
                         data.frame(FPM(data = data, paramList = paramList, alpha.test = pick2, ...)[["FPM"]], alpha.test = pick2),
                         data.frame(FPM(data = data, paramList = paramList, alpha.test = pick3, ...)[["FPM"]], alpha.test = pick3),
                         data.frame(FPM(data = data, paramList = paramList, alpha.test = pick4, ...)[["FPM"]], alpha.test = pick4)
                         )
                )[-1,] 
            })## force data into standard template then remove template row
        
        row.names(out) <- c("sensSpec", "OR", "FM", "MCC")
        
        if(plot){
            plot(x = alpha.test, y = OR, ylim = c(0,1), 
                 xlab = "Alpha", ylab = "Error rate / Metric Value", type = "n")
            lines(x = alpha.test, y = pFN, col = "darkred", type = "l", lwd = 2)
            lines(x = alpha.test, y = pFP, col = "darkblue", type = "l", lwd = 2)
            lines(x = alpha.test, y = sensSpec, col = "darkorchid")
            lines(x = alpha.test, y = OR, col = "darkorange")
            lines(x = alpha.test, y = FM, col = "gold3")
            lines(x = alpha.test, y = MCC, col = "forestgreen")
        

            legend("top", inset = -0.1, xpd = TRUE, cex = 0.8, ncol = 2, bg = "white",
                   lty = c(1, 1, 1, 1, 1, 1), lwd = c(2, 2, 1, 1, 1, 1),
                   col = c("darkred", "darkblue", 
                           "darkorchid", "darkorange", "gold3", "forestgreen"),
                   legend = c("FN rate", "FP rate", "sensSpec", "OR", "FM", "MCC"))
        }
        
        if(simplify){
            return(
                c(sensSpec = out["sensSpec", "alpha.test"],
                  OR = out["OR", "alpha.test"],
                  FM = out["FM", "alpha.test"],
                  MCC = out["MCC", "alpha.test"])
            )
        } else{
            return(
                list(optim = out, 
                    optim_alpha = c(sensSpec = out["sensSpec", "alpha.test"],
                  OR = out["OR", "alpha.test"],
                  FM = out["FM", "alpha.test"],
                  MCC = out["MCC", "alpha.test"]))
            )
        }
    }
}## end code

Try the RFPM package in your browser

Any scripts or data that you put into this service are public.

RFPM documentation built on Oct. 26, 2023, 1:08 a.m.