appraiseR: R Package For Real Estate Appraisals

Documented in bestfit bestfitEst

#' Fitter function for bestfit
#'
#' This is the basic computing engine called by \code{\link{bestfit}}. This
#' function should not be used directly by the user.
#'
#' @param X a design matrix for a regression model
#' @param y the vector of the response variable
#' @param t The transformed data
#' @param p The combinations of transformed variables to be tested
#' @param response The name of the response variable
#' @return a vector with adjusted R2 to each fit

bestfitEst <- function(X, y, t, p, response){

  ## Montagem da matriz
  M <- cbind(y, X)
  colnames(M)[1] <- response

  ## Calculo do R2 de cada combinacao
  R2 <- vector(mode = "numeric", length = nrow(p))
  ll <- vector(mode = "numeric", length = nrow(p))
  Akaike <- vector(mode = "numeric", length = nrow(p))
  Bayes <- vector(mode = "numeric", length = nrow(p))

  ## Loop: a cada iteracao, verifica apenas as transformacoes que mudaram da
  ## iteracao anterior
  for (i in seq(nrow(p))){
    a <- p[i,][p[i-1,] != p[i,]]
    n <- paste(a, names(a), sep = ".")
    for (j in seq_along(a))
      M[,names(a)[j]] <- t[,n[j]]
    fit <- RcppEigen::fastLmPure(X = M[,-1], y = M[,1], method = 0L)
    # computo de R2
    res <- fit$residuals
    R2[i] <- miscTools::rSquared(M[,response], res)
    # computo de AIC e BIC
    k.original <- length(fit$coefficients)
    df.ll <- k.original + 1
    n <- nrow(M)
    ll <- 0.5*(- n*(log(2*pi) + 1 - log(n) + log(sum(res^2))))
    Akaike[i] <- -2*ll + 2*df.ll
    Bayes[i] <- -2*ll + log(n)*df.ll
  }

  ## Calculo de R2 ajustado e formatacao dos dados para impressao em tela
  n <- nrow(M)
  gl <- fit$df.residual
  R2 <- round(1-((n-1)/gl)*(1-R2), digits = 3)
  AIC <- round(Akaike, digits = 2)
  BIC <- round(Bayes, digits = 2)
  list(adj.R2 = R2, AIC = AIC, BIC = BIC)
}

#' Best fit models
#'
#' Find best transformations of the parameters for Linear Regression.
#'
#' @param formula A standard linear regression formula, with no transformation
#'   in the parameters.
#' @param data A data frame containing the variables in the model.
#' @param subset a specification of the rows to be used: defaults to all rows.
#'   This can be any valid indexing vector (see \link{[.data.frame}) for the
#'   rows of data or if that is not supplied, a data frame made up of the
#'   variables used in \code{formula}.
#' @param transf A family of functions to be used to transform the variables in
#'   the data frame, in order to find the best combination of transformation to
#'   be applied to the data - usually functions of the box-cox family.
#' @examples
#' library(sf)
#' data(centro_2015)
#' centro_2015 <- st_drop_geometry(centro_2015)
#' centro_2015 <- within(centro_2015, PU <- valor/area_total)
#' best_fit <- bestfit(PU ~ quartos + suites + garagens + dist_b_mar + padrao,
#'                     extras = ~ poly(area_total, 2), data = centro_2015)
#' print(best_fit, n = 20)
#' s <- summary(best_fit)
#'
#' #There still may be outliers:
#' out <- car::outlierTest(s$fit) #31
#' outliers <- 31
#'
#' # There are two ways to handle with them:
#'
#' # Recalling bestfit with a subset argument ...
#' best_fit2 <- bestfit(valor ~ ., data = dados, subset = -outliers)
#'
#' # Or assigning a subset argument directly into summary.bestfit
#'  s <- summary(best_fit, fit = 1, subset = -outliers)
#'
#' # The latter takes less computational effort, since it only updates the
#' # lm call of the chosen fit. The former is more precise, since it runs
#' # bestfit again without the outliers.
#'
#' @rdname bestfit
#' @export
#'
bestfit <- function(formula, extras, data, subset = NULL,
                            transf = c('rec', 'rsqrt', 'log', 'sqrt')){
  f <- formula
  DF <- as.data.frame(data)
  mf <- stats::model.frame(formula = f, data = DF)
  predictors <- attr(stats::terms.formula(f, data = DF), "term.labels")
  response <- colnames(mf)[attr(stats::terms.formula(f, data = DF),
                                "response")]
  parametros <- c(response, predictors)

  for (i in parametros) if (is.character(DF[,i])) DF[,i] <- as.factor(DF[,i])

  ## Montagem da matriz X e do vetor y para o ajuste do modelo

  if (!missing(extras)) {
    ff <- foo ~ bar + baz
    if (is.call(extras))
      gg <- extras
    else
      gg <- parse(text=paste("~", paste(extras, collapse="+")))[[1L]]
    ff[[2L]] <- f[[2L]]
    ff[[3L]][[2L]] <- f[[3L]]
    ff[[3L]][[3L]] <- gg[[2L]]
  } else {
    ff <- f
  }

  # Monta model frame
  # cl1 <- match.call(expand.dots = FALSE)
  # m <- match(c("formula", "data", "subset"), names(cl1), 0L)
  # MF <- cl1[c(1L, m)]
  # MF[[1L]] <- quote(stats::model.frame)
  # MF <- eval(MF, parent.frame())
  MF <- eval(call("model.frame", formula = ff, data = DF, subset = subset),
               envir = parent.frame())

  # Monta model matrix e model response
  X <- stats::model.matrix(attr(MF, "terms"), data = MF)
  y <- stats::model.response(MF)

  t <- alltransf(data = DF, subset = subset,
                 select = parametros, transf = transf)
  p <- allperm(data = DF, subset = subset,
               select = parametros, transf = transf)

  newdata <-  DF[which(is.na(DF[, response])), , drop = FALSE]

  #z <- bestfit.default(X, y, t, p, response)

  X <- as.matrix(X)
  y <- as.numeric(y)
  t1 <- as.data.frame(t)
  if (length(parametros) == 1)
    colnames(t1) <- paste(names(t), lapply(t, names), sep = ".")
  p <- as.matrix(p)
  response <- as.character(response)

  z <- bestfitEst(X, y, t1, p, response)

  class(z) <- "bestfit"

  ## Join combinations with adj.R2 vector in a single data frame
  z$tab <- data.frame(id = seq(nrow(p)), p)
  z$tab$adj_R2 <- z$adj.R2
  z$tab$AIC <- z$AIC
  z$tab$BIC <- z$BIC
  z$tab <- z$tab[order(z$tab[,"AIC"]),]
  z$tab[,"id"] <- seq(nrow(z$tab))

  z$call <- match.call()
  z$subset <- if (missing(subset)) NULL else subset
  z$combinations <- p
  z$response <- response
  z$predictors <- predictors
  if (nrow(newdata) == 0) z$newdata <- NULL else z$newdata <- newdata
  z
}