R/run_models_performance.R

#' Run Models Performance
#'
#'
#' @title Run Models Performance
#' @description This function summarises the performance of the models fitted by the run_models function on an independent validation data set
#' @param fit_run_model list of fitted models generated by the run_models function
#' @param df_valida validation data frame (the first column must be the response)
#' @param verbose logical; if TRUE, comparison graphics between the models are plotted
#' @details Dispatches to a classification or a regression summary according to the model type of the fitted models
#' @importFrom caret postResample
#' @importFrom dplyr tibble
#' @importFrom ggplot2 ggplot geom_point ggtitle geom_abline
#' @importFrom ggplot2 geom_tile facet_wrap geom_text
#' @importFrom ggplot2 xlim ylim aes labs geom_col
#' @importFrom ggplot2 scale_x_discrete scale_y_discrete
#' @importFrom ggplot2 scale_fill_gradientn theme element_text
#' @importFrom tidyr gather
#' @importFrom viridis scale_color_viridis
#' @importFrom stats var na.omit predict
#' @examples
#' \dontrun{
#' library(labgeo)
#' library(dplyr)
#' data("c_stock")
#' vf = labgeo::factor_detect(c_stock, 40)
#' df = c_stock %>%  mutate_at(vf,funs(factor)) %>% filter(stock_10 < 10)
#' dp = data_preparation(df = df, p = 0.20, prune = 0.99)
#' treino = dp$treino
#' teste = dp$teste
#' nm = names(treino)[1]
#' vrf = recursive_feature_elimination(treino,sizes = c(5,10,15,20,25,30,40),
#' nfolds = 5,cpu_cores = 7, metric = "Rsquared")
#' vsel = c(nm,vrf$optVariables)
#' dfsel = treino %>% select(one_of(vsel))
#' models = c("ridge", "rf", "cubist","pls")
#' fit_models = run_models(df = dfsel, models = models,
#'                         cpu_cores = 7, tune_length = 5, metric = "Rsquared")
#' dfresult = run_models_performance(fit_models, df_valida = teste, verbose = T)
#' }
#' @export

run_models_performance <- function(fit_run_model,
                                   df_valida, verbose = FALSE) {
  package_inicio <- search()[grepl("^package:", search())]
  if (fit_run_model[[1]]$modelType == "Classification") {
    result <- rmp_classificacao(fit_run_model, df_valida, verbose)
  } else {
    result <- rmp_regressao(fit_run_model, df_valida, verbose)
  }
  ## detach any packages attached while predicting with the fitted models
  package_fim <- search()[grepl("^package:", search())]
  package_list <- setdiff(package_fim, package_inicio)
  if (length(package_list) > 0) {
    for (package in package_list) detach(package, character.only = TRUE)
  }
  return(result)
}
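
## Illustrative follow-up (a sketch, not part of the package): `fit_models` and
## `teste` are assumed to come from run_models() and data_preparation() as in
## the roxygen example above; for regression fits the returned tibble carries,
## among others, the columns model, r2 and rmse, so the best model can be
## picked directly from the summary.
if (FALSE) {
  perf <- run_models_performance(fit_models, df_valida = teste, verbose = FALSE)
  perf[which.max(perf$r2), c("model", "r2", "rmse")]
}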




#' Calculate accuracy statistics for regression models
#'
#'
#' @title Calculate accuracy statistics for regression models
#' @description This function calculates accuracy statistics for regression models
#' @param obs vector with observed data
#' @param pred vector with predicted values
#' @details Returns a list with bias, absolute and squared error statistics, the Nash-Sutcliffe efficiency, the percentage of variance explained and the R-squared
#' @importFrom stats cor t.test
#' @examples
#' \dontrun{
#' v.obs = c(0, 25, 75, 100)
#' v.pred = c(1, 24, 70, 97)
#' pred_acc(v.obs, v.pred)
#' }
#' @export
pred_acc <- function(obs, pred) {
  mu <- mean(obs)
  mbe <- mean(obs - pred) ## mean bias error
  mae <- mean(abs(obs - pred)) ## mean absolute error
  mse <- mean( (obs - pred) ^ 2) ## mean square error
  rme <- mbe / mu * 100 ## relative mean error
  rmae <- mae / mu * 100 ## relative mean absolute error
  rmse <- sqrt(mse) ## root mean square error
  rrmse <- rmse / mu * 100 ## relative root mean square error
  mo <- mean( (obs - mu) ^ 2)
  nse <- 1 - (mse / mo) ## Nash-Sutcliffe efficiency
  r2 <- stats::cor(obs, pred) ^ 2
  #t <- stats::t.test(obs, pred, paired = TRUE)
  vecv <- (1 - sum( (obs - pred) ^ 2) /
    sum( (obs - mean(obs)) ^ 2)) * 100 ## variance explained by models

  list(
    mean_bias_error = mbe,
    relative_mean_bias_error = rme,
    mean_absolute_error = mae,
    relative_mean_absolute_error = rmae,
    mean_square_error = mse,
    root_mean_square_error = rmse,
    relative_rmse = rrmse,
    Nash_Sutcliffe_efficiency = nse,
    variance_explained_perc = vecv,
    rsquared = r2
    #t_test = list(t)
  )
}
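
## A minimal illustration of pred_acc() on made-up numbers (a sketch; the
## observed/predicted vectors below are purely hypothetical):
if (FALSE) {
  obs_demo  <- c(10, 20, 30, 40, 50)
  pred_demo <- c(12, 18, 33, 37, 52)
  acc <- pred_acc(obs_demo, pred_demo)
  round(acc$root_mean_square_error, 3)     # typical error magnitude
  round(acc$Nash_Sutcliffe_efficiency, 3)  # 1 = perfect, 0 = no better than the mean
  round(acc$rsquared, 3)                   # squared Pearson correlation
}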



rmp_classificacao <- function(fit_run_model, df_valida, verbose = FALSE) {
  Freq <- var <- valor <- Prediction <- Reference <- model <- NULL
  nm <- length(fit_run_model)
  summ_model <- dplyr::tibble(
    model = character(nm), fit = list(nm), dfpredobs = list(nm),
    accuracy = numeric(nm), Kappa = numeric(nm), time_run = numeric(nm),
    byclass = list(nm), cf = list(nm),
    g1 = list(nm), g2 = list(nm)
  )
  cont = 1
  for (i in 1:length(fit_run_model)) {
    fit_md <- fit_run_model[[i]]
    if (is.null(fit_md) == FALSE) {
      v <- tryCatch({ predict(fit_md, df_valida)},
                    error = function(e){NULL})
      if (is.null(v) == FALSE) {
        ddd <- data.frame(observado = df_valida[, 1], predito = v)
        summ_model$model[cont] <- fit_md$method
        summ_model$fit[cont] <- list(fit_md)
        summ_model$dfpredobs[cont] <- list(ddd)
        cf <- caret::confusionMatrix(ddd$predito,
                                     ddd$observado, mode = "everything")
        summ_model$accuracy[cont] <- cf$overall[1]
        summ_model$Kappa[cont] <- cf$overall[2]
        summ_model$byclass[cont] <- list(cf$byClass)
        summ_model$time_run[cont] <- fit_md$times$everything[3]
        summ_model$cf[cont] <- list(cf$table)
        cont <- cont + 1
      }
      if (verbose == TRUE && is.null(v) == FALSE) {
        confusion <- data.frame(cf$table)
        freqcols <- prop.table(cf$table, 2) %>%
          data.frame() %>%  dplyr::rename(freq_col = Freq)
        freqrows <- prop.table(cf$table, 1) %>%
          data.frame() %>%  dplyr::rename(freq_row = Freq)
        ddd <- dplyr::left_join(freqcols, freqrows,
                                by = c("Prediction", "Reference")) %>%
          tidyr::gather(key = var, value = valor, -Prediction, -Reference)

        g1 <- ggplot(confusion, mapping = aes(x = Reference, y = Prediction)) +
          geom_tile(colour = "white", fill = "lightyellow2") +
          scale_x_discrete(name = "Actual Class") +
          scale_y_discrete(name = "Predicted Class") +
          geom_tile(
            aes(x = Reference, y = Prediction),
            data = subset(confusion, as.character(Reference) ==
                            as.character(Prediction)),
            color = "black", size = 1, fill = "red", alpha = 0.2
          ) +
          geom_text(aes(label = confusion$Freq), vjust = 1) +
          ggtitle(fit_md$method)

        g2 <- ggplot(ddd, mapping = aes(x = Reference, y = Prediction)) +
          geom_tile(aes(fill = valor), colour = "white") +
          scale_fill_gradientn(
            colours = c("lightyellow2", "white", "palegreen"),
            values = rescale(c(0, 50, 100))
          ) +
          scale_x_discrete(name = "Actual Class") +
          scale_y_discrete(name = "Predicted Class") +
          labs(fill = "Normalized\nFrequency") +
          geom_text(aes(label = round(ddd$valor, 2)), vjust = 1) +
          ggtitle(fit_md$method) +
          facet_wrap(~var)
      }
      if (verbose == TRUE && is.null(v) == FALSE) {
        print(g1)
        print(g2)
      }
    }
  }
  summ_model <- summ_model[seq_len(cont - 1), ]
  if (verbose == TRUE) {
    dfresult <- data.frame(model = summ_model$model,
                           accuracy = summ_model$accuracy,
                           kappa = summ_model$Kappa,
                           time = summ_model$time_run) %>%
      print() %>% tidyr::gather(key = var, value = valor, -model, -time)
    g1 = ggplot(dfresult, aes(x = model, y = valor, fill = model)) +
      geom_col() +
      geom_text(aes(label = round(valor, 3)), size = 2.5, vjust = 1.5) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
      facet_wrap(~ var, scales = 'free')
    print(g1)
  }
  return(summ_model)
}



## from scales package
## https://github.com/hadley/scales

zero_range <- function(x, tol = 1000 * .Machine$double.eps) {
  if (length(x) == 1)
    return(TRUE)
  if (length(x) != 2)
    stop("x must be length 1 or 2")
  if (any(is.na(x)))
    return(NA)
  if (x[1] == x[2])
    return(TRUE)
  if (all(is.infinite(x)))
    return(FALSE)
  m <- min(abs(x))
  if (m == 0)
    return(FALSE)
  abs( (x[1] - x[2]) / m) < tol
}

rescale <- function(x, to = c(0, 1),
                    from = range(x, na.rm = TRUE, finite = TRUE)) {
  if (zero_range(from) || zero_range(to)) {
    return(ifelse(is.na(x), NA, mean(to)))
  }
  (x - from[1]) / diff(from) * diff(to) + to[1]
}
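
## Quick illustration of the two helpers above (a sketch): rescale() maps the
## gradient breakpoints used in scale_fill_gradientn() onto [0, 1], and
## zero_range() guards against degenerate input ranges.
if (FALSE) {
  rescale(c(0, 50, 100))   # 0.0 0.5 1.0
  zero_range(c(5, 5))      # TRUE, so rescale(c(5, 5)) would return mean(to) = 0.5
}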



#' Plot predict observed and residual
#'
#' @title Plot predict observed and residual
#' @description This function plots predicted versus observed values and, optionally, the residuals
#' @param result tibble returned by the run_models_performance function
#' @param residual logical; if TRUE, residual segments between predicted and observed values are drawn
#' @details The predicted versus observed scatter is faceted by model, with a 1:1 reference line and points coloured by absolute residual
#' @importFrom dplyr tibble
#' @importFrom ggplot2 ggplot geom_segment geom_point geom_abline facet_wrap
#' @importFrom ggplot2 ggtitle aes theme theme_bw element_text
#' @importFrom ggplot2 scale_color_continuous guides xlim ylim
#' @importFrom dplyr '%>%'
#' @examples
#' \dontrun{
#' plot_predict_observed_residual(result, TRUE)
#' }
#' @export
#'
plot_predict_observed_residual <- function(result, residual = FALSE) {
  observado <- predito <- residuo <- NULL
  nr <- nrow(result)
  for (i in 1:nr) {
    ddd <- result$dfpredobs[[i]]
    ddd$model <- result$model[i]
    if (i == 1) {
      dresult <- ddd
    } else {
      dresult <- rbind(dresult, ddd)
    }
  }
  if (residual == TRUE) {
    g1 <- dresult  %>%
      ggplot(aes(y = observado, x = predito)) +
      geom_segment(aes(y = predito, yend = observado,
                       x = predito, xend = predito)) +
      xlim(range(c(dresult$predito, dresult$observado))) +
      ylim(range(c(dresult$predito, dresult$observado))) +
      geom_abline(slope = 1, intercept = 0) +
      geom_point(aes(y = observado), shape = 1) +
      geom_point(aes(color = abs(residuo))) +
      scale_color_continuous(low = "green", high = "red") +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      guides(color = FALSE) +
      facet_wrap(~model, scales = "free")
  } else {
    g1 <- dresult  %>%
      ggplot(aes(y = observado, x = predito)) +
      geom_abline(slope = 1, intercept = 0) +
      geom_point(aes(y = observado), shape = 1) +
      geom_point(aes(color = abs(residuo))) +
      scale_color_continuous(low = "green", high = "red") +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      guides(color = FALSE) +
      facet_wrap(~model, scales = "free")
  }
  #print(g1)
  return(g1)
}
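
## Illustrative call (a sketch; `perf` is assumed to be the tibble returned by
## run_models_performance() for regression models, which carries the
## dfpredobs list-column this plot relies on):
if (FALSE) {
  g <- plot_predict_observed_residual(perf, residual = TRUE)
  print(g)
  ggplot2::ggsave("predicted_vs_observed.png", g, width = 8, height = 6)
}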




#' Plot Confusion Matrix
#'
#' @title  Plot Confusion Matrix
#' @description This function plots a confusion matrix as a heat map of relative frequencies and returns an HTML table of the raw counts
#' @param obs  observed (reference) values, as a factor
#' @param pred predicted values, as a factor with the same levels
#' @importFrom ggplot2 ggplot geom_tile geom_text scale_fill_gradient theme
#' @importFrom ggplot2 ggtitle aes element_text
#' @importFrom tidyr gather
#' @importFrom caret confusionMatrix
#' @importFrom tibble rownames_to_column
#' @importFrom dplyr '%>%'
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling scroll_box
#' @export
plot_confusio_matrix <- function(obs, pred) {
  actual <- prediction <- freq <- NULL
  ## heat map of row-normalised frequencies (each prediction row sums to 1)
  g <- caret::confusionMatrix(pred, obs)$table %>%
    prop.table(margin = 1) %>%
    as.data.frame.matrix() %>%
    rownames_to_column(var = "prediction") %>%
    tidyr::gather(key = "actual", value = "freq", -prediction) %>%
    ggplot2::ggplot(aes(x = actual, y = prediction, fill = freq)) +
    geom_tile() +
    geom_text(aes(label = round(freq, 2)), size = 3, color = "gray20") +
    scale_fill_gradient(low = "yellow", high = "red", limits = c(0, 1),
                        name = "Relative Frequency") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    ggtitle("Confusion Matrix")
  print(g)

  ## HTML table of the raw confusion-matrix counts
  caret::confusionMatrix(pred, obs)$table %>%
    as.data.frame.matrix() %>%
    kable("html") %>%
    kable_styling(bootstrap_options = c("striped"), font_size = 8) %>%
    scroll_box(height = "400px")
}
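
## Illustrative call with toy factors (a sketch; the classes and values are
## made up, both vectors just need to be factors with the same levels):
if (FALSE) {
  set.seed(1)
  obs_demo  <- factor(sample(c("A", "B", "C"), 60, replace = TRUE))
  pred_demo <- factor(sample(c("A", "B", "C"), 60, replace = TRUE))
  plot_confusio_matrix(obs_demo, pred_demo)
}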


rmp_regressao <- function(fit_run_model, df_valida, verbose = FALSE) {
  predito <- observado <- model <- mbe <- mae <- rmse <- nse <-
    r2 <- var_exp <- var <- valor <- NULL
  nm <- length(fit_run_model)
  summ_model <- dplyr::tibble(
    model = character(nm), r2 = numeric(nm), rmse = numeric(nm),
    mbe = numeric(nm), mae = numeric(nm), nse = numeric(nm),
    var_exp = numeric(nm), time = numeric(nm),
    fit = list(nm), dfpredobs = list(nm)
  )

  cont = 1
  for (i in 1:length(fit_run_model)) {

    fit_md <- fit_run_model[[i]]
    if (is.null(fit_md) == FALSE) {
      v <- tryCatch({ predict(fit_md, df_valida[ ,-1])},
                    error = function(e){
                      print(paste( "Error:", conditionMessage(e)))
                      return(NULL)})
      if (is.null(v)==FALSE) {
        v = unlist(v)
        nr = nrow(df_valida)
        ddd <- data.frame(model = rep(fit_md$method, nr),
                          observado = df_valida[, 1],
                          predito = v,
                          residuo = abs(v - df_valida[, 1]),
                          stringsAsFactors = FALSE) %>% na.omit()
        names(ddd)[2] <- "observado"
        names(ddd)[3] <- "predito"
        names(ddd)[4] <- "residuo"
        summ_model$model[cont] <- fit_md$method
        summ_model$fit[cont] <- list(fit_md)
        summ_model$dfpredobs[cont] <- list(ddd)
        acc <- pred_acc(ddd$observado, ddd$predito)
        summ_model$r2[cont] <- acc$rsquared
        summ_model$rmse[cont] <- acc$root_mean_square_error
        summ_model$mbe[cont] <- acc$mean_bias_error
        summ_model$mae[cont] <- acc$mean_absolute_error
        summ_model$nse[cont] <- acc$Nash_Sutcliffe_efficiency
        summ_model$var_exp[cont] <- acc$variance_explained_perc
        summ_model$time[cont] <- fit_md$times$everything[3]
        cont = cont + 1
      }
    }
  }
  summ_model <- summ_model[seq_len(cont - 1), ]
  if (verbose == TRUE) {
    dgr <- summ_model %>%
      dplyr::select(model, mbe, mae, rmse, nse, r2, var_exp) %>%
      na.omit() %>%
      tidyr::gather(key = var, value = valor, -model)

    g1 <- ggplot2::ggplot(dgr, aes(x = model, y = valor, fill = model)) +
      ggplot2::geom_col() +
      ggplot2::geom_text(aes(label = round(valor, 3)), size = 2.5, vjust = 1.5) +
      ggplot2::facet_wrap(~var, scales = "free") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1),
            legend.position = "none") +
      ggplot2::ggtitle("Regression performance on validation data")
    print(g1)
  }
  return(summ_model)
}