#' Fitter function for bestfit
#'
#' This is the basic computing engine called by \code{\link{bestfit}}. This
#' function should not be used directly by the user.
#'
#' @param X a design matrix for a regression model
#' @param y the vector of the response variable
#' @param t The transformed data
#' @param p The combinations of transformed variables to be tested
#' @param response The name of the response variable
#' @return a vector with adjusted R2 to each fit
bestfitEst <- function(X, y, t, p, response){
## Montagem da matriz
M <- cbind(y, X)
colnames(M)[1] <- response
## Calculo do R2 de cada combinacao
R2 <- vector(mode = "numeric", length = nrow(p))
ll <- vector(mode = "numeric", length = nrow(p))
Akaike <- vector(mode = "numeric", length = nrow(p))
Bayes <- vector(mode = "numeric", length = nrow(p))
## Loop: a cada iteracao, verifica apenas as transformacoes que mudaram da
## iteracao anterior
for (i in seq(nrow(p))){
a <- p[i,][p[i-1,] != p[i,]]
n <- paste(a, names(a), sep = ".")
for (j in seq_along(a))
M[,names(a)[j]] <- t[,n[j]]
fit <- RcppEigen::fastLmPure(X = M[,-1], y = M[,1], method = 0L)
# computo de R2
res <- fit$residuals
R2[i] <- miscTools::rSquared(M[,response], res)
# computo de AIC e BIC
k.original <- length(fit$coefficients)
df.ll <- k.original + 1
n <- nrow(M)
ll <- 0.5*(- n*(log(2*pi) + 1 - log(n) + log(sum(res^2))))
Akaike[i] <- -2*ll + 2*df.ll
Bayes[i] <- -2*ll + log(n)*df.ll
}
## Calculo de R2 ajustado e formatacao dos dados para impressao em tela
n <- nrow(M)
gl <- fit$df.residual
R2 <- round(1-((n-1)/gl)*(1-R2), digits = 3)
AIC <- round(Akaike, digits = 2)
BIC <- round(Bayes, digits = 2)
list(adj.R2 = R2, AIC = AIC, BIC = BIC)
}
#' Best fit models
#'
#' Find best transformations of the parameters for Linear Regression.
#'
#' @param formula A standard linear regression formula, with no transformation
#' in the parameters.
#' @param data A data frame containing the variables in the model.
#' @param subset a specification of the rows to be used: defaults to all rows.
#' This can be any valid indexing vector (see \link{[.data.frame}) for the
#' rows of data or if that is not supplied, a data frame made up of the
#' variables used in \code{formula}.
#' @param transf A family of functions to be used to transform the variables in
#' the data frame, in order to find the best combination of transformation to
#' be applied to the data - usually functions of the box-cox family.
#' @examples
#' library(sf)
#' data(centro_2015)
#' centro_2015 <- st_drop_geometry(centro_2015)
#' centro_2015 <- within(centro_2015, PU <- valor/area_total)
#' best_fit <- bestfit(PU ~ quartos + suites + garagens + dist_b_mar + padrao,
#' extras = ~ poly(area_total, 2), data = centro_2015)
#' print(best_fit, n = 20)
#' s <- summary(best_fit)
#'
#' #There still may be outliers:
#' out <- car::outlierTest(s$fit) #31
#' outliers <- 31
#'
#' # There are two ways to handle with them:
#'
#' # Recalling bestfit with a subset argument ...
#' best_fit2 <- bestfit(valor ~ ., data = dados, subset = -outliers)
#'
#' # Or assigning a subset argument directly into summary.bestfit
#' s <- summary(best_fit, fit = 1, subset = -outliers)
#'
#' # The latter takes less computational effort, since it only updates the
#' # lm call of the chosen fit. The former is more precise, since it runs
#' # bestfit again without the outliers.
#'
#' @rdname bestfit
#' @export
#'
bestfit <- function(formula, extras, data, subset = NULL,
transf = c('rec', 'rsqrt', 'log', 'sqrt')){
f <- formula
DF <- as.data.frame(data)
mf <- stats::model.frame(formula = f, data = DF)
predictors <- attr(stats::terms.formula(f, data = DF), "term.labels")
response <- colnames(mf)[attr(stats::terms.formula(f, data = DF),
"response")]
parametros <- c(response, predictors)
for (i in parametros) if (is.character(DF[,i])) DF[,i] <- as.factor(DF[,i])
## Montagem da matriz X e do vetor y para o ajuste do modelo
if (!missing(extras)) {
ff <- foo ~ bar + baz
if (is.call(extras))
gg <- extras
else
gg <- parse(text=paste("~", paste(extras, collapse="+")))[[1L]]
ff[[2L]] <- f[[2L]]
ff[[3L]][[2L]] <- f[[3L]]
ff[[3L]][[3L]] <- gg[[2L]]
} else {
ff <- f
}
# Monta model frame
# cl1 <- match.call(expand.dots = FALSE)
# m <- match(c("formula", "data", "subset"), names(cl1), 0L)
# MF <- cl1[c(1L, m)]
# MF[[1L]] <- quote(stats::model.frame)
# MF <- eval(MF, parent.frame())
MF <- eval(call("model.frame", formula = ff, data = DF, subset = subset),
envir = parent.frame())
# Monta model matrix e model response
X <- stats::model.matrix(attr(MF, "terms"), data = MF)
y <- stats::model.response(MF)
t <- alltransf(data = DF, subset = subset,
select = parametros, transf = transf)
p <- allperm(data = DF, subset = subset,
select = parametros, transf = transf)
newdata <- DF[which(is.na(DF[, response])), , drop = FALSE]
#z <- bestfit.default(X, y, t, p, response)
X <- as.matrix(X)
y <- as.numeric(y)
t1 <- as.data.frame(t)
if (length(parametros) == 1)
colnames(t1) <- paste(names(t), lapply(t, names), sep = ".")
p <- as.matrix(p)
response <- as.character(response)
z <- bestfitEst(X, y, t1, p, response)
class(z) <- "bestfit"
## Join combinations with adj.R2 vector in a single data frame
z$tab <- data.frame(id = seq(nrow(p)), p)
z$tab$adj_R2 <- z$adj.R2
z$tab$AIC <- z$AIC
z$tab$BIC <- z$BIC
z$tab <- z$tab[order(z$tab[,"AIC"]),]
z$tab[,"id"] <- seq(nrow(z$tab))
z$call <- match.call()
z$subset <- if (missing(subset)) NULL else subset
z$combinations <- p
z$response <- response
z$predictors <- predictors
if (nrow(newdata) == 0) z$newdata <- NULL else z$newdata <- newdata
z
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.