R/run_models_performance.R

#' Run Models Performance
#'
#'
#' @title Run Models Performance
#' @description This function performs the summary of perfomance of models fitter by run_models function
#' @param fit_run_model list generated by function run_models
#' @param df_valida validation data frame
#' @param verbose plot graphics of comparation between models
#' @details details
#' @importFrom caret postResample
#' @importFrom dplyr tibble
#' @importFrom ggplot2 ggplot geom_point ggtitle geom_abline
#' @importFrom ggplot2 geom_tile facet_wrap geom_text
#' @importFrom ggplot2 xlim  ylim aes labs geom_col
#' @importFrom ggplot2 scale_x_discrete scale_y_discrete
#' @importFrom ggplot2 scale_fill_gradientn
#' @importFrom tidyr gather
#' @importFrom stats var
#' @examples
#' \dontrun{
#' library(labgeo)
#' library(dplyr)
#' data("c_stock")
#' vf = labgeo::factor_detect(c_stock, 40)
#' df = c_stock %>%  mutate_at(vf,funs(factor)) %>% filter(stock_10 < 10)
#' dp = data_preparation(df = df, p = 0.20, prune = 0.99)
#' treino = dp$treino
#' teste = dp$teste
#' nm = names(treino)[1]
#' vrf = recursive_feature_elimination(treino,sizes = c(5,10,15,20,25,30,40),
#' nfolds = 5,cpu_cores = 7, metric = "Rsquared")
#' vsel = c(nm,vrf$optVariables)
#' dfsel = treino %>% select(one_of(vsel))
#' models = c("ridge", "rf", "cubist","pls")
#' fit_models = run_models(df = dfsel, models = models,
#'                         cpu_cores = 7, tune_length = 5, metric = "Rsquared")
#' dfresult = run_models_performance(fit_models, df_valida = teste, verbose = T)
#' }
#' @export

# importFrom viridis scale_color_viridis
run_models_performance <- function(fit_run_model, df_valida, verbose = FALSE) {
  package_inicio <- search()[ifelse(unlist(gregexpr("package:", search())) == 1,
                                    TRUE, FALSE)]

  if (fit_run_model[[1]]$modelType == "Classification") {
    result <- rmp_classificacao(fit_run_model, df_valida, verbose)
    package_fim <- search()[ifelse(unlist(gregexpr("package:", search())) == 1,
                                   TRUE, FALSE)]
    package_list <- setdiff(package_fim, package_inicio)
    if (length(package_list) > 0) {
      for (package in package_list) {
        detach(package, character.only = TRUE)
      }
    }
    return(result)
  } else {
    result <- rmp_regressao(fit_run_model, df_valida, verbose)
    package_fim <- search()[ifelse(unlist(gregexpr("package:", search())) == 1,
                                   TRUE, FALSE)]
    package_list <- setdiff(package_fim, package_inicio)
    if (length(package_list) > 0) {
      for (package in package_list) {
        detach(package, character.only = TRUE)
      }
    }
    return(result)
  }
}




#' Calculate statiscs of accuracy to regression models
#'
#'
#' @title Calculate statiscs of accuracy to regression models
#' @description This function Calculate statiscs of accuracy to regression models
#' @param obs vector with observated data
#' @param pred vector with predicted value
#' @details details
#' @importFrom stats cor t.test
#' @examples
#' \dontrun{
#' v.obs = c(0,25,75,100)
#' v.pred = c(1.24,70,97)
#' pred_acc(v.obs, v.pred)
#' }
#' @export
pred_acc <- function(obs, pred) {
  mu <- mean(obs)
  mbe <- mean(obs - pred)
  mae <- mean(abs(obs - pred))
  mse <- mean( (obs - pred) ^ 2)
  rme <- mbe / mu * 100
  rmae <- mae / mu * 100
  rmse <- sqrt(mse)
  rrmse <- rmse / mu * 100
  mo <- mean( (obs - mu) ^ 2)
  nse <- 1 - (mse / mo)
  r2 <- stats::cor(obs, pred) ^ 2

  vecv <- (1 - sum( (obs - pred) ^ 2) /
    sum( (obs - mean(obs)) ^ 2)) * 100

  list(
    mean_bias_error = mbe,
    relative_mean_bias_error = rme,
    mean_absolute_error = mae,
    relative_mean_absolute_error = rmae,
    mean_square_error = mse,
    root_mean_square_error = rmse,
    relative_rmse = rrmse,
    Nash_Sutcliffe_efficiency = nse,
    variance_explained_perc = vecv,
    rsquared = r2
  )
}



rmp_classificacao <- function(fit_run_model, df_valida, verbose = FALSE) {
  Freq <- var <- value <- Prediction <- Reference <- model <- time <- NULL

  nm <- length(fit_run_model)

  summ_model <- dplyr::tibble(
    model = character(nm), fit = list(nm), dfpredobs = list(nm),
    accuracy = numeric(nm), Kappa = numeric(nm), time_run = numeric(nm),
    byclass = list(nm), cf = list(nm),
    g1 = list(nm), g2 = list(nm)
  )

  cont <- 1

  for (i in seq_along(fit_run_model)) {
    fit_md <- fit_run_model[[i]]
    if (is.null(fit_md) == FALSE) {
      v <- tryCatch({
          predict(fit_md, df_valida[, -1])
        },
        error = function(e) {
          print(" ")
          print(e)
          NULL
        }
      )

      if (is.null(v) == FALSE) {
        ddd <- data.frame(vobs = df_valida[, 1], vpred = v)
        summ_model$model[cont] <- fit_md$method
        summ_model$fit[cont] <- list(fit_md)

        summ_model$dfpredobs[cont] <- list(ddd)
        cf <- caret::confusionMatrix(
          ddd$vpred,
          ddd$vobs, mode = "everything"
        )
        summ_model$accuracy[cont] <- cf$overall[1]
        summ_model$Kappa[cont] <- cf$overall[2]
        summ_model$byclass[cont] <- list(cf$byClass)
        summ_model$time_run[cont] <- fit_md$times$everything[3]
        summ_model$cf[cont] <- list(cf$table)
        cont <- cont + 1
      }
    }
    if (verbose == TRUE) {
      confusion <- data.frame(cf$table)
      freqcols <- prop.table(cf$table, 2) %>%
        data.frame() %>%
        dplyr::rename(freq_col = Freq)
      freqrows <- prop.table(cf$table, 1) %>%
        data.frame() %>%
        dplyr::rename(freq_row = Freq)
      ddd <- dplyr::left_join(
        freqcols, freqrows,
        by = c("Prediction", "Reference")
      ) %>%
        tidyr::gather(key = var, value = value, -Prediction, -Reference)

      g1 <- ggplot(confusion, mapping = aes(x = Reference, y = Prediction)) +
        geom_tile(colour = "white", fill = "lightyellow2") +
        scale_x_discrete(name = "Actual Class") +
        scale_y_discrete(name = "Predicted Class") +
        geom_tile(
          aes(x = Reference, y = Prediction),
          data = subset(confusion, as.character(Reference) ==
            as.character(Prediction)),
          color = "black", size = 1, fill = "red", alpha = 0.2
        ) +
        geom_text(aes(label = confusion$Freq), vjust = 1) +
        ggtitle(fit_md$method)

      g2 <- ggplot(ddd, mapping = aes(x = Reference, y = Prediction)) +
        geom_tile(aes(fill = value), colour = "white") +
        scale_fill_gradientn(
          colours = c("lightyellow2", "white", "palegreen"),
          values = rescale(c(0, 50, 100))
        ) +
        scale_x_discrete(name = "Actual Class") +
        scale_y_discrete(name = "Predicted Class") +
        labs(fill = "Normalized\nFrequency") +
        geom_text(aes(label = round(ddd$value, 3)), vjust = 1) +
        ggtitle(fit_md$method) +
        facet_wrap(~var)
    }
    if (verbose == TRUE) {
      print(g1)
      print(g2)
    }
  }

  summ_model <- summ_model[1:(cont - 1), ]

  if (verbose == TRUE) {
    dfresult <- data.frame(
      model = summ_model$model,
      accuracy = summ_model$accuracy,
      kappa = summ_model$Kappa,
      time = summ_model$time_run
    ) %>%
      print() %>%
      tidyr::gather(key = var, value = value, -model, -time)
    g1 <- ggplot(dfresult, aes(x = model, y = value, fill = model)) +
      geom_col() +
      geom_text(aes(label = round(value, 3)), size = 2.5, vjust = 1.5) +
      theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
      ) +
      facet_wrap(~ var, scales = "free")
    print(g1)
  }
  return(summ_model)
}






zero_range <- function(x, tol = 1000 * .Machine$double.eps) {
  if (length(x) == 1) {
    return(TRUE)
  }
  if (length(x) != 2) {
    stop("x must be length 1 or 2")
  }
  if (any(is.na(x))) {
    return(NA)
  }
  if (x[1] == x[2]) {
    return(TRUE)
  }
  if (all(is.infinite(x))) {
    return(FALSE)
  }
  m <- min(abs(x))
  if (m == 0) {
    return(FALSE)
  }
  abs( (x[1] - x[2]) / m) < tol
}

rescale <- function(x, to = c(0, 1),
                    from = range(x, na.rm = TRUE, finite = TRUE)) {
  if (zero_range(from) || zero_range(to)) {
    return(ifelse(is.na(x), NA, mean(to)))
  }
  (x - from[1]) / diff(from) * diff(to) + to[1]
}



#' Plot predict observed and residual
#'
#' @title Plot predict observed and residual
#' @description This function plots predict,  observed values and residual
#' @param result  return of function rum_model_performance
#' @param residual logic value indicates if plot or not residuals
#' @details details
#' @importFrom dplyr tibble
#' @importFrom ggplot2 ggplot  geom_segment geom_point geom_abline facet_wrap
#' @importFrom ggplot2 ggtitle aes theme_bw scale_color_continuous guides xlim ylim
#' @importFrom dplyr '%>%'
#' @examples
#' \dontrun{
#' plot_predict_observed_residual(result, TRUE)
#' }
#' @export
#'
plot_predict_observed_residual <- function(result, residual = FALSE) {
  vobs <- vpred <- residuo <- NULL
  nr <- nrow(result)
  i <- 1
  for (i in 1:nr) {
    ddd <- result$dfpredobs[[i]]
    ddd$model <- result$model[i]
    if (i == 1) {
      dresult <- ddd
    } else {
      dresult <- rbind(dresult, ddd)
    }
  }
  if (residual == TRUE) {
    g1 <- dresult %>%
      ggplot(aes(y = vobs, x = vpred)) +
      geom_segment(aes(
        y = vpred, yend = vobs,
        x = vpred, xend = vpred
      )) +
      xlim(range(c(ddd$vpred, ddd$vobs))) +
      ylim(range(c(ddd$vpred, ddd$vobs))) +
      geom_abline(slope = 1, intercept = 0) +
      geom_point(aes(y = vobs), shape = 1) +
      geom_point(aes(color = abs(residuo))) +
      scale_color_continuous(low = "green", high = "red") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      guides(color = FALSE) +
      facet_wrap(~model, scales = "free") + theme_bw()
  } else {
    g1 <- dresult %>%
      ggplot(aes(y = vobs, x = vpred)) +
      geom_abline(slope = 1, intercept = 0) +
      geom_point(aes(y = vobs), shape = 1) +
      geom_point(aes(color = abs(residuo))) +
      scale_color_continuous(low = "green", high = "red") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      guides(color = FALSE) +
      facet_wrap(~model, scales = "free") + theme_bw()
  }

  return(g1)
}




#' Plot Confusion Matrix
#'
#' @title  Plot Confusion Matrix
#' @description This function  Plot Confusion Matrix
#' @param obs  observed values
#' @param pred predicted values
#' @importFrom ggplot2 ggplot geom_tile geom_text scale_fill_gradient theme
#' @importFrom ggplot2 ggtitle element_text aes
#' @importFrom tidyr gather
#' @importFrom caret confusionMatrix
#' @importFrom tibble rownames_to_column
#' @importFrom knitr kable
#' @importFrom kableExtra kable_styling scroll_box
#' @export
plot_confusion_matrix <- function(obs, pred) {
  actual <- prediction <- freq <- NULL
  g1 <- caret::confusionMatrix(pred, obs)$table %>%
    prop.table(margin = 1) %>%
    as.data.frame.matrix() %>%
    tibble::rownames_to_column(var = "actual") %>%
    tidyr::gather(key = "prediction", value = "freq", -actual) %>%
    ggplot2::ggplot(ggplot2::aes(x = actual, y = prediction, fill = freq)) +
    ggplot2::geom_tile() +
    ggplot2::geom_text(
      ggplot2::aes(label = round(freq, 2)), size = 3,
      color = "gray20"
    ) +
    ggplot2::scale_fill_gradient(
      low = "yellow", high = "red", limits = c(0, 1),
      name = "Relative Frequency"
    ) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
    ggplot2::ggtitle("Confusion Matrix")
  print(g1)

  kp <- caret::confusionMatrix(pred, obs)$table %>%
    as.data.frame.matrix() %>%
    knitr::kable() %>%
    kableExtra::kable_styling(bootstrap_options = c("striped"), font_size = 8)

  print(" ")
  print("----------------------------------------------------------")
  print("confusion matrix")
  print(kp)
  ddd <- caret::confusionMatrix(pred, obs)$overall
  dddf <- data.frame(names(ddd), ddd)
  rownames(dddf) <- NULL
  names(dddf) <- c("metric", "value")
  print(" ")
  print("----------------------------------------------------------")
  print("metrics")
  print(knitr::kable(dddf))

  ddd <- data.frame( (caret::confusionMatrix(pred, obs))$byClass)
  ddd <- data.frame(labgeo::abbrev_colnames(ddd, 16))
  print(" ")
  print("----------------------------------------------------------")
  print("metrics")
  print(knitr::kable(ddd[, 1:4]))
  print(knitr::kable(ddd[, 5:8]))
  print(knitr::kable(ddd[, 9:11]))
}


rmp_regressao <- function(fit_run_model, df_valida, verbose = FALSE) {
  vpred <- vobs <- model <- mbe <- mae <- rmse <- nse <-
    r2 <- var_exp <- var <- value <- NULL
  nm <- length(fit_run_model)
  summ_model <- dplyr::tibble(
    model =  character(nm), r2 = numeric(nm), rmse = numeric(nm),
    mbe =  numeric(nm), mae = numeric(nm), nse = numeric(nm),
    var_exp =  numeric(nm), time = numeric(nm),
    fit =  list(nm), dfpredobs = list(nm)
  )

  cont <- 1
  for (i in seq_along(fit_run_model)) {
    fit_md <- fit_run_model[[i]]
    if (is.null(fit_md) == FALSE) {
      v <- tryCatch({
          predict(fit_md, df_valida[, -1])
        },
        error = function(e) {
          print(paste("Error:", conditionMessage(e)))
          return(NULL)
        }
      )
      if (is.null(v) == FALSE) {
        v <- unlist(v)
        nr <- nrow(df_valida)
        ddd <- data.frame(
          model = rep(fit_md$method, nr),
          vobs = df_valida[, 1], vpred = v,
          residuo = abs(v - df_valida[, 1]),
          stringsAsFactors = FALSE
        ) %>% na.omit()
        names(ddd)[2] <- "vobs"
        names(ddd)[3] <- "vpred"
        names(ddd)[4] <- "residuo"
        summ_model$model[cont] <- fit_md$method
        summ_model$fit[cont] <- list(fit_md)
        summ_model$dfpredobs[cont] <- list(ddd)
        acc <- pred_acc(ddd$vpred, ddd$vobs)
        summ_model$r2[cont] <- acc$rsquared
        summ_model$rmse[cont] <- acc$root_mean_square_error
        summ_model$mbe[cont] <- acc$mean_bias_error
        summ_model$mae[cont] <- acc$mean_absolute_error
        summ_model$nse[cont] <- acc$Nash_Sutcliffe_efficiency
        summ_model$var_exp[cont] <- acc$variance_explained_perc
        summ_model$time[cont] <- fit_md$times$everything[3]
        cont <- cont + 1
      }
    }
  }
  summ_model <- summ_model[1:(cont - 1), ]
  if (verbose == TRUE) {
    dgr <- summ_model %>%
      select(model, mbe, mae, rmse, nse, r2, var_exp) %>%
      na.omit() %>%
      tidyr::gather(key = var, value = value, -model)

    g1 <- ggplot2::ggplot(dgr, aes(x = model, y = value, fill = model)) +
      ggplot2::geom_col() +
      ggplot2::geom_text(
        aes(label = round(value, 3)),
        size = 2.5, vjust = 1.5
      ) +
      ggplot2::facet_wrap(~var, scales = "free") +
      ggplot2::theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none"
      ) +
      ggplot2::ggtitle(model)
    print(g1)
  }
  return(summ_model)
}


#' @title  fit null model
#' @description calculate RMSE of null model
#' @param train_y vector with train outcome variable
#' @param test_y  vector with test outcome variable
#' @importFrom caret postResample
#' @examples
#' \dontrun{
#' null_model(train_y = train$Ozone, test_y = test$Ozone)
#' }
#' @export
null_model <- function(train_y, test_y) {
  media <- mean(train_y)
  vr <- caret::postResample(rep(media, length(test_y)), test_y)
  return(RMSE = vr[1])
}
elpidiofilho/labgeo documentation built on May 14, 2019, 9:35 a.m.