#' Obtain performance statistics of a series of linear models
#'
#' @description
#' Compute several statistics measuring the performance of a series of linear models built using
#' [pedometrics::buildModelSeries()], with an option to rank the models based on one of the returned
#' performance statistics.
#'
#' @param model A list of linear models returned by [pedometrics::buildModelSeries()].
#'
#' @param design.info Extra information about the linear models in the series.
#'
#' @param arrange.by Character string defining if the table with the performance statistics of the
#' linear models should be arranged, and which column should be used. Available options are
#' `"candidates"`, `"df"`, `"aic"`, `"rmse"`, `"nrmse"`, `"r2"`, `"adj_r2"`, and `"ADJ_r2"`.
#' Descending order is used by default and cannot be changed in the current implementation. See
#' \sQuote{Value} for more information.
#'
#' @param digits Integer or vector with six integers indicating the number of decimal places to be
#' used to round the performance statistics. If a vector is passed to the function, the number of
#' decimal places should be in the following order:
#'
#' `c("aic", "rmse", "nrmse", "r2", "adj_r2", "ADJ_r2")`.
#'
#' @details
#' This function was devised to deal with a list of linear models generated by the function
#' [pedometrics::buildModelSeries()]. The main objective is to compare several linear models using
#' several performance statistics. Such statistics can then be used to rank the linear models and
#' identify, for example, the best performing model, given the selected performance statistics.
#'
#' An important feature of [pedometrics::buildModelSeries()] is that it uses the information about
#' the initial number of candidate predictor variables offered to the build the model to calculate
#' penalized or adjusted measures of model performance. Such information is recorded as an attribute
#' of the final model selected by [pedometrics::buildModelSeries()]. This feature was included in
#' [pedometrics::statsModelSeries()] because data-driven variable selection results biased linear
#' models (too optimistic), and the effective number of degrees of freedom is close to the number of
#' candidate predictor variables initially offered to the model (Harrell, 2001).
#'
#' @return
#' A data frame with several performance statistics:
#'
#' \describe{
#' \item{id}{Identification of the model.}
#' \item{candidates}{Number of candidate predictor variables initially
#' offered to the model.}
#' \item{df}{Number of degrees of freedom of the final selected model.}
#' \item{aic}{Akaike's Information Criterion (AIC). Obtained using
#' \code{extractAIC}.}
#' \item{rmse}{Root-mean squared error, calculated based on the number of
#' candidate predictor variables initially offered to the model.}
#' \item{nrmse}{Normalized Root-mean squared error, calculated as the ratio
#' between the RMSE and the standard deviation of the observed values of the
#' dependent variable.}
#' \item{r2}{Multiple coefficient of determination.}
#' \item{adj_r2}{Adjusted multiple coefficient of determination.}
#' \item{ADJ_r2}{Adjusted multiple coefficient of determination. Calculations
#' are done based on the number of candidate predictor variables initially
#' offered to the model.}
#' }
#'
#' @references
#' Harrell, F. E. (2001) _Regression modelling strategies: with applications to linear models,
#' logistic regression, and survival analysis._ First edition. New York: Springer.
#'
#' Venables, W. N. and Ripley, B. D. (2002) _Modern applied statistics with S._ Fourth edition. New
#' York: Springer.
#'
#' A. Samuel-Rosa, G. B. M. Heuvelink, G. de Mattos Vasques, and L. H. C. dos Anjos, Do more
#' detailed environmental covariates deliver more accurate soil maps?, _Geoderma_, vol. 243–244,
#' pp. 214–227, May 2015, doi: 10.1016/j.geoderma.2014.12.017.
#'
#' @author Alessandro Samuel-Rosa \email{alessandrosamuelrosa@@gmail.com}
#'
#' @section TODO:
#' * Include other performance statistics such as: PRESS, BIC, Mallow's Cp, max(VIF);
#' * Add option to select which performance statistics should be returned.
#'
#' @seealso [pedometrics::buildModelSeries()], [pedometrics::plotModelSeries()]
#'
# @section Dependencies:
# The __plyr__ package, provider of tools for splitting, applying and combining data in R, is
# required for [pedometrics::statsModelSeries()] to work. The development version of the __plyr__
# package is available on <https://github.com/hadley/plyr> while its old versions are available on
# the CRAN archive at <https://cran.r-project.org/src/contrib/Archive/plyr/>.
#'
#' @examples
# if (require(plyr)) {
#' if (interactive()) {
#' # based on the second example of function MASS:stepAIC()
#' require(MASS)
#' cpus1 <- cpus
#' for(v in names(cpus)[2:7])
#' cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])),
#' include.lowest = TRUE)
#' cpus0 <- cpus1[, 2:8] # excludes names, authors' predictions
#' cpus.samp <- sample(1:209, 100)
#' cpus.form <- list(formula(log10(perf) ~ syct + mmin + mmax + cach + chmin +
#' chmax + perf),
#' formula(log10(perf) ~ syct + mmin + cach + chmin + chmax),
#' formula(log10(perf) ~ mmax + cach + chmin + chmax + perf))
#' data <- cpus1[cpus.samp,2:8]
#' cpus.ms <- buildModelSeries(cpus.form, data, vif = TRUE, aic = TRUE)
#' cpus.des <- data.frame(a = c(0, 1, 0), b = c(1, 0, 1), c = c(1, 1, 0))
#' stats <- statsModelSeries(cpus.ms, design.info = cpus.des, arrange.by = "aic")
#' }
# FUNCTION #########################################################################################
#' @export
#' @rdname statsModelSeries
statsModelSeries <-
function(model, design.info, arrange.by, digits) {
# check if suggested packages are installed
# if (!requireNamespace("plyr")) stop("plyr package is missing")
# check function arguments
if (missing(model)) {
stop("'model' is a mandatory argument")
}
if (!inherits(model, c("list", "lm"))) {
stop("'model' should be of class list or lm")
}
if (!missing(design.info)) {
if (!inherits(design.info, "data.frame")) {
stop("'design.info' should be of class data.frame")
}
}
if (!missing(arrange.by)) {
if (!inherits(arrange.by, "character")) {
stop("'arrange.by' should be of class character")
}
}
if (inherits(model, "lm")) {
model <- list(model)
}
n <- as.numeric(sapply(model, attr, "n"))
p <- as.numeric(sapply(model, attr, "p"))
df <- sapply(model, stats::extractAIC)[1, ]
aic <- sapply(model, stats::extractAIC)[2, ]
res <- sapply(model, stats::residuals)
rmse <- rep(NA, ncol(res))
nrmse <- rep(NA, ncol(res))
sd_y <- stats::sd(data.frame(lapply(model, stats::model.frame))[, 1])
for (i in seq_len(ncol(res))) {
rmse[i] <- sqrt(sum(res[, i] * res[, i]) / (n[i] - p[i]))
nrmse[i] <- rmse[i] / sd_y
}
r2 <- as.numeric(unlist(sapply(model, summary)["r.squared", ]))
adj_r2 <- as.numeric(unlist(sapply(model, summary)["adj.r.squared", ]))
ADJ_r2 <- adjR2(r2, n, p = c(p - 1))
id <- seq(1, length(n), 1)
if (!missing(digits)) {
if (length(digits) == 6) {
aic <- round(aic, digits[1])
rmse <- round(rmse, digits[2])
nrmse <- round(nrmse, digits[3])
r2 <- round(r2, digits[4])
adj_r2 <- round(adj_r2, digits[5])
ADJ_r2 <- round(ADJ_r2, digits[6])
} else {
aic <- round(aic, digits)
rmse <- round(rmse, digits)
nrmse <- round(nrmse, digits)
r2 <- round(r2, digits)
adj_r2 <- round(adj_r2, digits)
ADJ_r2 <- round(ADJ_r2, digits)
}
}
if (!missing(design.info)) {
candidates <- p
tab <- data.frame(
cbind(id, design.info, candidates, df, aic, rmse, nrmse, r2, adj_r2, ADJ_r2),
stringsAsFactors = FALSE)
} else {
tab <- data.frame(id, candidates = p, df, aic, rmse, nrmse, r2, adj_r2, ADJ_r2,
stringsAsFactors = FALSE)
}
if (!missing(arrange.by)) {
# tab <- plyr::arrange(tab, plyr::desc(tab[, arrange.by]))
idx_arrange <- order(tab[[arrange.by]], decreasing = TRUE)
tab <- tab[idx_arrange, ]
}
return(tab)
}
#' @export
#' @rdname statsModelSeries
statsMS <- statsModelSeries
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.