pedometrics: Miscellaneous Pedometric Tools

Documented in statsModelSeries

#' Obtain performance statistics of a series of linear models
#'
#' @description
#' Compute several statistics measuring the performance of a series of linear models built using
#' [pedometrics::buildModelSeries()], with an option to rank the models based on one of the returned
#' performance statistics.
#'
#' @param model A list of linear models returned by [pedometrics::buildModelSeries()].
#'
#' @param design.info Extra information about the linear models in the series.
#'
#' @param arrange.by Character string defining if the table with the performance statistics of the
#' linear models should be arranged, and which column should be used. Available options are
#' `"candidates"`, `"df"`, `"aic"`, `"rmse"`, `"nrmse"`, `"r2"`, `"adj_r2"`, and `"ADJ_r2"`.
#' Descending order is used by default and cannot be changed in the current implementation. See
#' \sQuote{Value} for more information.
#'
#' @param digits Integer or vector with six integers indicating the number of decimal places to be
#' used to round the performance statistics. If a vector is passed to the function, the number of
#' decimal places should be in the following order: 
#' 
#' `c("aic", "rmse", "nrmse", "r2", "adj_r2", "ADJ_r2")`.
#' 
#' @details
#' This function was devised to deal with a list of linear models generated by the function
#' [pedometrics::buildModelSeries()]. The main objective is to compare several linear models using
#' several performance statistics. Such statistics can then be used to rank the linear models and
#' identify, for example, the best performing model, given the selected performance statistics.
#' 
#' An important feature of [pedometrics::buildModelSeries()] is that it uses the information about
#' the initial number of candidate predictor variables offered to the build the model to calculate
#' penalized or adjusted measures of model performance. Such information is recorded as an attribute
#' of the final model selected by [pedometrics::buildModelSeries()]. This feature was included in
#' [pedometrics::statsModelSeries()] because data-driven variable selection results biased linear
#' models (too optimistic), and the effective number of degrees of freedom is close to the number of
#' candidate predictor variables initially offered to the model (Harrell, 2001).
#' 
#' @return
#' A data frame with several performance statistics:
#' 
#' \describe{
#' \item{id}{Identification of the model.}
#' \item{candidates}{Number of candidate predictor variables initially 
#' offered to the model.}
#' \item{df}{Number of degrees of freedom of the final selected model.}
#' \item{aic}{Akaike's Information Criterion (AIC). Obtained using 
#' \code{extractAIC}.}
#' \item{rmse}{Root-mean squared error, calculated based on the number of 
#' candidate predictor variables initially offered to the model.}
#' \item{nrmse}{Normalized Root-mean squared error, calculated as the ratio 
#' between the RMSE and the standard deviation of the observed values of the 
#' dependent variable.}
#' \item{r2}{Multiple coefficient of determination.}
#' \item{adj_r2}{Adjusted multiple coefficient of determination.}
#' \item{ADJ_r2}{Adjusted multiple coefficient of determination. Calculations 
#' are done based on the number of candidate predictor variables initially 
#' offered to the model.}
#' }
#' 
#' @references
#' Harrell, F. E. (2001) _Regression modelling strategies: with applications to linear models,
#' logistic regression, and survival analysis._ First edition. New York: Springer.
#' 
#' Venables, W. N. and Ripley, B. D. (2002) _Modern applied statistics with S._ Fourth edition. New
#' York: Springer.
#' 
#' A. Samuel-Rosa, G. B. M. Heuvelink, G. de Mattos Vasques, and L. H. C. dos Anjos, Do more
#' detailed environmental covariates deliver more accurate soil maps?, _Geoderma_, vol. 243–244,
#' pp. 214–227, May 2015, doi: 10.1016/j.geoderma.2014.12.017.
#' 
#' @author Alessandro Samuel-Rosa \email{alessandrosamuelrosa@@gmail.com}
#' 
#' @section TODO:
#' * Include other performance statistics such as: PRESS, BIC, Mallow's Cp, max(VIF);
#' * Add option to select which performance statistics should be returned.
#'
#' @seealso [pedometrics::buildModelSeries()], [pedometrics::plotModelSeries()]
#' 
# @section Dependencies:
# The __plyr__ package, provider of tools for splitting, applying and combining data in R, is
# required for [pedometrics::statsModelSeries()] to work. The development version of the __plyr__
# package is available on <https://github.com/hadley/plyr> while its old versions are available on
# the CRAN archive at <https://cran.r-project.org/src/contrib/Archive/plyr/>.
#' 
#' @examples
# if (require(plyr)) {
#' if (interactive()) {
#'   # based on the second example of function MASS:stepAIC()
#'   require(MASS)
#'   cpus1 <- cpus
#'   for(v in names(cpus)[2:7])
#'     cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])), 
#'                       include.lowest = TRUE)
#'   cpus0 <- cpus1[, 2:8]  # excludes names, authors' predictions
#'   cpus.samp <- sample(1:209, 100)
#'   cpus.form <- list(formula(log10(perf) ~ syct + mmin + mmax + cach + chmin +
#'                     chmax + perf),
#'                     formula(log10(perf) ~ syct + mmin + cach + chmin + chmax),
#'                     formula(log10(perf) ~ mmax + cach + chmin + chmax + perf))
#'   data <- cpus1[cpus.samp,2:8]
#'   cpus.ms <- buildModelSeries(cpus.form, data, vif = TRUE, aic = TRUE)
#'   cpus.des <- data.frame(a = c(0, 1, 0), b = c(1, 0, 1), c = c(1, 1, 0))
#'   stats <- statsModelSeries(cpus.ms, design.info = cpus.des, arrange.by = "aic")
#' }
# FUNCTION #########################################################################################
#' @export
#' @rdname statsModelSeries
statsModelSeries <-
  function(model, design.info, arrange.by, digits) {
    # check if suggested packages are installed
    # if (!requireNamespace("plyr")) stop("plyr package is missing")
    # check function arguments
    if (missing(model)) {
      stop("'model' is a mandatory argument")
    }
    if (!inherits(model, c("list", "lm"))) {
      stop("'model' should be of class list or lm")
    }
    if (!missing(design.info)) {
      if (!inherits(design.info, "data.frame")) {
        stop("'design.info' should be of class data.frame")
      }
    }
    if (!missing(arrange.by)) {
      if (!inherits(arrange.by, "character")) {
        stop("'arrange.by' should be of class character")
      }
    }
    if (inherits(model, "lm")) {
      model <- list(model)
    }
    n <- as.numeric(sapply(model, attr, "n"))
    p <- as.numeric(sapply(model, attr, "p"))
    df <- sapply(model, stats::extractAIC)[1, ]
    aic <- sapply(model, stats::extractAIC)[2, ]
    res <- sapply(model, stats::residuals)
    rmse <- rep(NA, ncol(res))
    nrmse <- rep(NA, ncol(res))
    sd_y <- stats::sd(data.frame(lapply(model, stats::model.frame))[, 1])
    for (i in seq_len(ncol(res))) {
      rmse[i] <- sqrt(sum(res[, i] * res[, i]) / (n[i] - p[i]))
      nrmse[i] <- rmse[i] / sd_y
    }
    r2     <- as.numeric(unlist(sapply(model, summary)["r.squared", ]))
    adj_r2 <- as.numeric(unlist(sapply(model, summary)["adj.r.squared", ]))
    ADJ_r2 <- adjR2(r2, n, p = c(p - 1))
    id <- seq(1, length(n), 1)
    if (!missing(digits)) {
      if (length(digits) == 6) {
        aic <- round(aic, digits[1])
        rmse <- round(rmse, digits[2])
        nrmse <- round(nrmse, digits[3])
        r2 <- round(r2, digits[4])
        adj_r2 <- round(adj_r2, digits[5])
        ADJ_r2 <- round(ADJ_r2, digits[6])
      } else {
        aic <- round(aic, digits)
        rmse <- round(rmse, digits)
        nrmse <- round(nrmse, digits)
        r2 <- round(r2, digits)
        adj_r2 <- round(adj_r2, digits)
        ADJ_r2 <- round(ADJ_r2, digits)
      }
    }
    if (!missing(design.info)) {
      candidates <- p
      tab <- data.frame(
        cbind(id, design.info, candidates, df, aic, rmse, nrmse, r2, adj_r2, ADJ_r2),
        stringsAsFactors = FALSE)
    } else {
      tab <- data.frame(id, candidates = p, df, aic, rmse, nrmse, r2, adj_r2, ADJ_r2,
        stringsAsFactors = FALSE)
    }
    if (!missing(arrange.by)) {
      # tab <- plyr::arrange(tab, plyr::desc(tab[, arrange.by]))
      idx_arrange <- order(tab[[arrange.by]], decreasing = TRUE)
      tab <- tab[idx_arrange, ]
    }
    return(tab)
  }
#' @export
#' @rdname statsModelSeries
statsMS <- statsModelSeries