Nothing
#' Obtain performance statistics of a series of linear models
#'
#' @description
#' Compute several statistics measuring the performance of a series of linear models built using
#' [pedometrics::buildModelSeries()], with an option to rank the models based on one of the returned
#' performance statistics.
#'
#' @param model A list of linear models returned by [pedometrics::buildModelSeries()].
#'
#' @param design.info Extra information about the linear models in the series.
#'
#' @param arrange.by Character string defining if the table with the performance statistics of the
#' linear models should be arranged, and which column should be used. Available options are
#' `"candidates"`, `"df"`, `"aic"`, `"rmse"`, `"nrmse"`, `"r2"`, `"adj_r2"`, and `"ADJ_r2"`.
#' Descending order is used by default and cannot be changed in the current implementation. See
#' \sQuote{Value} for more information.
#'
#' @param digits Integer or vector with six integers indicating the number of decimal places to be
#' used to round the performance statistics. If a vector is passed to the function, the number of
#' decimal places should be in the following order:
#'
#' `c("aic", "rmse", "nrmse", "r2", "adj_r2", "ADJ_r2")`.
#'
#' @details
#' This function was devised to deal with a list of linear models generated by the function
#' [pedometrics::buildModelSeries()]. The main objective is to compare several linear models using
#' several performance statistics. Such statistics can then be used to rank the linear models and
#' identify, for example, the best performing model, given the selected performance statistics.
#'
#' An important feature of [pedometrics::buildModelSeries()] is that it uses the information about
#' the initial number of candidate predictor variables offered to the build the model to calculate
#' penalized or adjusted measures of model performance. Such information is recorded as an attribute
#' of the final model selected by [pedometrics::buildModelSeries()]. This feature was included in
#' [pedometrics::statsModelSeries()] because data-driven variable selection results biased linear
#' models (too optimistic), and the effective number of degrees of freedom is close to the number of
#' candidate predictor variables initially offered to the model (Harrell, 2001).
#'
#' @return
#' A data frame with several performance statistics:
#'
#' \describe{
#' \item{id}{Identification of the model.}
#' \item{candidates}{Number of candidate predictor variables initially
#' offered to the model.}
#' \item{df}{Number of degrees of freedom of the final selected model.}
#' \item{aic}{Akaike's Information Criterion (AIC). Obtained using
#' \code{extractAIC}.}
#' \item{rmse}{Root-mean squared error, calculated based on the number of
#' candidate predictor variables initially offered to the model.}
#' \item{nrmse}{Normalized Root-mean squared error, calculated as the ratio
#' between the RMSE and the standard deviation of the observed values of the
#' dependent variable.}
#' \item{r2}{Multiple coefficient of determination.}
#' \item{adj_r2}{Adjusted multiple coefficient of determination.}
#' \item{ADJ_r2}{Adjusted multiple coefficient of determination. Calculations
#' are done based on the number of candidate predictor variables initially
#' offered to the model.}
#' }
#'
#' @references
#' Harrell, F. E. (2001) _Regression modelling strategies: with applications to linear models,
#' logistic regression, and survival analysis._ First edition. New York: Springer.
#'
#' Venables, W. N. and Ripley, B. D. (2002) _Modern applied statistics with S._ Fourth edition. New
#' York: Springer.
#'
#' A. Samuel-Rosa, G. B. M. Heuvelink, G. de Mattos Vasques, and L. H. C. dos Anjos, Do more
#' detailed environmental covariates deliver more accurate soil maps?, _Geoderma_, vol. 243–244,
#' pp. 214–227, May 2015, doi: 10.1016/j.geoderma.2014.12.017.
#'
#' @author Alessandro Samuel-Rosa \email{alessandrosamuelrosa@@gmail.com}
#'
#' @section TODO:
#' * Include other performance statistics such as: PRESS, BIC, Mallow's Cp, max(VIF);
#' * Add option to select which performance statistics should be returned.
#'
#' @seealso [pedometrics::buildModelSeries()], [pedometrics::plotModelSeries()]
#'
# @section Dependencies:
# The __plyr__ package, provider of tools for splitting, applying and combining data in R, is
# required for [pedometrics::statsModelSeries()] to work. The development version of the __plyr__
# package is available on <https://github.com/hadley/plyr> while its old versions are available on
# the CRAN archive at <https://cran.r-project.org/src/contrib/Archive/plyr/>.
#'
#' @examples
# if (require(plyr)) {
#' if (interactive()) {
#' # based on the second example of function MASS:stepAIC()
#' require(MASS)
#' cpus1 <- cpus
#' for(v in names(cpus)[2:7])
#' cpus1[[v]] <- cut(cpus[[v]], unique(quantile(cpus[[v]])),
#' include.lowest = TRUE)
#' cpus0 <- cpus1[, 2:8] # excludes names, authors' predictions
#' cpus.samp <- sample(1:209, 100)
#' cpus.form <- list(formula(log10(perf) ~ syct + mmin + mmax + cach + chmin +
#' chmax + perf),
#' formula(log10(perf) ~ syct + mmin + cach + chmin + chmax),
#' formula(log10(perf) ~ mmax + cach + chmin + chmax + perf))
#' data <- cpus1[cpus.samp,2:8]
#' cpus.ms <- buildModelSeries(cpus.form, data, vif = TRUE, aic = TRUE)
#' cpus.des <- data.frame(a = c(0, 1, 0), b = c(1, 0, 1), c = c(1, 1, 0))
#' stats <- statsModelSeries(cpus.ms, design.info = cpus.des, arrange.by = "aic")
#' }
# FUNCTION #########################################################################################
#' @export
#' @rdname statsModelSeries
statsModelSeries <-
function(model, design.info, arrange.by, digits) {
# check if suggested packages are installed
# if (!requireNamespace("plyr")) stop("plyr package is missing")
# check function arguments
if (missing(model)) {
stop("'model' is a mandatory argument")
}
if (!inherits(model, c("list", "lm"))) {
stop("'model' should be of class list or lm")
}
if (!missing(design.info)) {
if (!inherits(design.info, "data.frame")) {
stop("'design.info' should be of class data.frame")
}
}
if (!missing(arrange.by)) {
if (!inherits(arrange.by, "character")) {
stop("'arrange.by' should be of class character")
}
}
if (inherits(model, "lm")) {
model <- list(model)
}
n <- as.numeric(sapply(model, attr, "n"))
p <- as.numeric(sapply(model, attr, "p"))
df <- sapply(model, stats::extractAIC)[1, ]
aic <- sapply(model, stats::extractAIC)[2, ]
res <- sapply(model, stats::residuals)
rmse <- rep(NA, ncol(res))
nrmse <- rep(NA, ncol(res))
sd_y <- stats::sd(data.frame(lapply(model, stats::model.frame))[, 1])
for (i in seq_len(ncol(res))) {
rmse[i] <- sqrt(sum(res[, i] * res[, i]) / (n[i] - p[i]))
nrmse[i] <- rmse[i] / sd_y
}
r2 <- as.numeric(unlist(sapply(model, summary)["r.squared", ]))
adj_r2 <- as.numeric(unlist(sapply(model, summary)["adj.r.squared", ]))
ADJ_r2 <- adjR2(r2, n, p = c(p - 1))
id <- seq(1, length(n), 1)
if (!missing(digits)) {
if (length(digits) == 6) {
aic <- round(aic, digits[1])
rmse <- round(rmse, digits[2])
nrmse <- round(nrmse, digits[3])
r2 <- round(r2, digits[4])
adj_r2 <- round(adj_r2, digits[5])
ADJ_r2 <- round(ADJ_r2, digits[6])
} else {
aic <- round(aic, digits)
rmse <- round(rmse, digits)
nrmse <- round(nrmse, digits)
r2 <- round(r2, digits)
adj_r2 <- round(adj_r2, digits)
ADJ_r2 <- round(ADJ_r2, digits)
}
}
if (!missing(design.info)) {
candidates <- p
tab <- data.frame(
cbind(id, design.info, candidates, df, aic, rmse, nrmse, r2, adj_r2, ADJ_r2),
stringsAsFactors = FALSE)
} else {
tab <- data.frame(id, candidates = p, df, aic, rmse, nrmse, r2, adj_r2, ADJ_r2,
stringsAsFactors = FALSE)
}
if (!missing(arrange.by)) {
# tab <- plyr::arrange(tab, plyr::desc(tab[, arrange.by]))
idx_arrange <- order(tab[[arrange.by]], decreasing = TRUE)
tab <- tab[idx_arrange, ]
}
return(tab)
}
#' @export
#' @rdname statsModelSeries
statsMS <- statsModelSeries
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.