rebecka_package: Utility functions used in the REBECKA project

Documented in fit_linear_model_groups fit_model

#' Linear model fitting on a dataframe
#'
#' This function allows you to fit a (multiple) linear model between
#' dependend and independend variables and perform cross validation.
#' DEPRECATED USE FIT_MODEL2 INSTEAD!!!!
#' @param df data.frame, A data.frame that is used to fit a linear model between
#' dependent and independent variables.
#' @param frml formula, A formula object for building the linear model
#' @param family string, 'lm' or 'gam' for linear model or generalized additive model, respectively.
#' @param plots Should scatterplots of the form plot(frml, data = df)
#' be added to the output?
#' @param verbose logical, if True the R2 of every model fit will be printed.
#' @param ... further arguments passed on to caret::trainControl (method, number, repeats).
#' Defaults correspond to repeatedcv using 5 folds repeated for 3 times. method = the
#' resampling method: "boot", "boot632", "optimism_boot", "boot_all", "cv", "repeatedcv",
#' "LOOCV", "LGOCV" (for repeated training/test splits), "none"
#' (only fits one model to the entire training set), "oob"
#' (only for random forest, bagged trees, bagged earth, bagged flexible
#' discriminant analysis, or conditional tree forest models), timeslice,
#' "adaptive_cv", "adaptive_boot" or "adaptive_LGOCV"
#' @keywords bioclimatic index, linear model
#' @export
#' @examples
#' fit_model(df = df.interp,
#'   frml = huglin ~ elevation,
#'   output = 'full',
#'   plots = T)

fit_model <- function(df, frml, family = 'lm', subset = 'full', preProc = '',
                      verbose = F, force_in = '', gam_grid = NULL, cubist_grid = NULL,
                      rf_grid = NULL, svmRadial_grid = NULL, ...) {

  fit_caret <- function() {

    params <- switch (family,
                      'lm' = list('lm', NULL, caret::lmFuncs),
                      'gam' = list('gam', gam_grid, caret::gamFuncs),
                      'cubist' = list('cubist', cubist_grid),
                      'rf' = list('rf', rf_grid, caret::rfFuncs),
                      'svmRadial' = list('svmRadial', svmRadial_grid)
    )

    pred <- all.vars(frml)[1]
    vars <- all.vars(frml)[-1]

    df <- df %>% dplyr::select(dplyr::matches(paste(all.vars(frml), collapse = '|')))

    if (subset == 'best') {

      mae_min <- 1000000000
      model <- NULL

      for (i in 1:length(vars)) {

        combinations <- combn(vars, m = i, simplify = F)

        for (comb in combinations) {

          frml_temp <- as.formula(str_c(pred, ' ~ ', str_c(comb, collapse = ' + ')))

          model_temp <- caret::train(frml_temp,
                                     data = df,
                                     method = params[[1]],
                                     preProcess = preProc,
                                     trControl = ctrl,
                                     tuneGrid = params[[2]])

          mae <- model_temp[['results']][['MAE']]

          if (mae < mae_min) {
            model <- model_temp
            mae_min <- mae
          }
        }
      }
    } else if (subset == 'rfe') {

      if (params[[1]] == 'cubist') stop('Rfe not implemented for cubist regression.')
      if (params[[1]] == 'svmRadial') stop('Rfe not implemented for svmRadial regression.')

      ctrl_rfe <- caret::rfeControl(functions = params[[3]],
                                    method = 'boot')

      if (length(preProc) > 0) {

        preProcValues <- caret::preProcess(df, method = preProc)
        df_rfe <- predict(preProcValues, df)

      }

      model_rfe <- caret::rfe(x = dplyr::select(df_rfe, vars),
                              y = dplyr::pull(df_rfe, pred),
                              sizes = 1:length(vars),
                              rfeControl = ctrl_rfe)

      opt_vars <- model_rfe$optVariables
      if (length(force_in) > 0 & !force_in %in% opt_vars) opt_vars <- c(opt_vars, force_in)

      frml_rfe <- as.formula(str_c(pred, ' ~ ', str_c(opt_vars, collapse = ' + ')))

      model <- caret::train(frml_rfe,
                            data = df,
                            method = params[[1]],
                            preProcess = preProc,
                            trControl = ctrl,
                            tuneGrid = params[[2]])

    } else if (subset == 'full') {

      model <- caret::train(frml,
                            data = df,
                            method = params[[1]],
                            preProcess = preProc,
                            trControl = ctrl,
                            tuneGrid = params[[2]])

    }

    return(model)
  }


  if (!all(all.vars(frml) %in% colnames(df))) { stop('Not all frml elements found as columns in df') }
  if (length(preProc) == 0) preProc <- NULL

  #Set cross validation defaults if not supplied
  ctrl.p <- list(...)
  if (is.null(ctrl.p$method)) ctrl.p$method <- 'boot'
  if (is.null(ctrl.p$number)) ctrl.p$number <- 25
  if (is.null(ctrl.p$repeats)) ctrl.p$repeats <- NA

  ctrl <- caret::trainControl(method = ctrl.p$method,
                              number = ctrl.p$number,
                              repeats = ctrl.p$repeats,
                              verboseIter = F)

  time_start <- Sys.time()

  if (family == 'lm' & ctrl.p$method == 'none' & subset == 'full') {
    model.fit <- lm(frml, data = df)
    verbose_string <- paste('R2: ', round(summary(model.fit)$r.squared, 2) )
  } else {
    model.fit <- fit_caret()
    verbose_string <- paste('MAE: ', round(min(model.fit[['results']][['MAE']]), 2))
  }

  time_end <- Sys.time()
  time_elapsed <- round(time_end - time_start, 3)

  if (verbose) print(paste(family, 'model with', subset, 'subset', verbose_string, '   (', time_elapsed, 's )'))

  return(model.fit)

}

#' Linear model fitting on subgroups of data
#'
#' This function allows you to fit a (multiple) linear model between
#' dependend and independend variables and perform cross validation to variable
#' subgroups of the data.
#' @param df data.frame, A data.frame that is used to fit a linear model between dependent and
#' independent variables.
#' @param frml formula, A formula object for building the linear model
#' @param group.cols string, column names that are used to group df
#' @param rowname.col string, name of the column that is used to give rownames to
#' the nested dataframes within df. Only useful if the desired output is 'augment' as
#' the rownames are used to name the output rows.
#' @param min.size integer, minimal amount of observations that have to be
#' within a group in order to fit the linear model. If the group is smaller, it
#' will be discarded.
#' @param output string, one of 'full', 'minimal', 'augmented', 'glanced' or 'tidied'.
#' 'full' will give all columns, 'minimal' only the linear model fits. For further
#' information about the rest see the respective function in package::broom.
#' @param ... further arguments passed on to caret::trainControl (method, number, repeats).
#' Defaults correspond to repeated crossvalidation using 5 folds repeated for 3 times.
#' @keywords bioclimatic index, linear model
#' @export
#' @examples
#' fit_linear_model_groups(df = df.interp,
#'   frml = huglin ~ elevation,
#'   predictor.raster = dem.st,
#'   file.name = data/huglin.tif,
#'   set.zero = T)

fit_linear_model_groups <- function(df, frml,
                                    group.cols = NULL, rowname.col = NULL,
                                    min.size = 30, output = 'full', ...) {

  name_rows <- function(df, rowname.col) {

    rownames(df) <- dplyr::pull(df, rowname.col)
    return(df)

  }
  fit_caret <- function(df, frml, ctrl) {

    model <- caret::train(frml,
                          data = df,
                          method = "lm",
                          preProcess = c("center", "scale"),
                          trControl = ctrl)

    pb$tick()

    return(model)
  }

  #Set cross validation defaults if not supplied
  ctrl.p <- list(...)
  if (is.null(ctrl.p$method)) ctrl.p$method <- 'repeatedcv'
  if (is.null(ctrl.p$number)) ctrl.p$number <- 5
  if (is.null(ctrl.p$repeats)) ctrl.p$repeats <- 3

  ctrl = caret::trainControl(method = ctrl.p$method, number = ctrl.p$number,
                             repeats = ctrl.p$repeats, verboseIter = F)

  if (is.null(group.cols)) {
    df <- dplyr::mutate(df, group_col = 1)
    group.cols <- 'group_col'
  }

  group.cols <- rlang::syms(group.cols)

  df <- df %>%
    dplyr::add_count(!!!group.cols) %>%
    dplyr::filter_at(ncol(.), dplyr::all_vars(.>min.size))

  if (nrow(df) == 0) { stop('Group sizes too small. Choose different group col or adjust min.size parameter.') }

  df.nest <- tidyr::nest(df, -c(!!!group.cols))

  if (!is.null(rowname.col)) {
    df.nest <- dplyr::mutate(df.nest, data = purrr::map(data,
                                                        name_rows,
                                                        rowname.col = rowname.col))
  }

  #Set up progress bar
  ticks <- nrow(df.nest)
  print(paste0('Total amount of models to be fitted: ', ticks))

  pb <- progress::progress_bar$new(total = ticks,
                                   format = "[:bar] :percent eta: :eta")
  pb$tick(0)

  #Fit a model to every row of df.nest
  df.models <- df.nest %>%

    dplyr::mutate(caret.fit = purrr::map(data, fit_caret, frml = frml, ctrl = ctrl),
                  caret.results = purrr::map(caret.fit, 'results'),
                  lm.fit = purrr::map(data, lm, formula = frml),
                  glanced = purrr::map(lm.fit, broom::glance),
                  augmented = purrr::map(lm.fit, broom::augment),
                  tidied = purrr::map(lm.fit, broom::tidy)) %>%

    arrange(!!!group.cols)

  if ('group_col' %in% colnames(df.models)) { df.models <- dplyr::select(df.models, -c(!!!group.cols)) }

  if (!output %in% c('minimal', 'full', 'glanced', 'augmented', 'tidied')) {
    stop('Unknown output. Choose one of: full, glanced, augmented, tidied')
  }

  if (output == 'minimal') {

    if (nrow(df.models) == 1) {
      return(df.models$caret.fit[[1]])
    } else {
      return(df.models$caret.fit)
    }
  }

  if (output != 'full') {

    output <- rlang::syms(output)
    df.models <- tidyr::unnest(df.models, !!!output, .drop = T)

  }

  return(df.models)
}