#' Linear model fitting on a dataframe
#'
#' This function allows you to fit a (multiple) linear model between
#' dependend and independend variables and perform cross validation.
#' DEPRECATED USE FIT_MODEL2 INSTEAD!!!!
#' @param df data.frame, A data.frame that is used to fit a linear model between
#' dependent and independent variables.
#' @param frml formula, A formula object for building the linear model
#' @param family string, 'lm' or 'gam' for linear model or generalized additive model, respectively.
#' @param plots Should scatterplots of the form plot(frml, data = df)
#' be added to the output?
#' @param verbose logical, if True the R2 of every model fit will be printed.
#' @param ... further arguments passed on to caret::trainControl (method, number, repeats).
#' Defaults correspond to repeatedcv using 5 folds repeated for 3 times. method = the
#' resampling method: "boot", "boot632", "optimism_boot", "boot_all", "cv", "repeatedcv",
#' "LOOCV", "LGOCV" (for repeated training/test splits), "none"
#' (only fits one model to the entire training set), "oob"
#' (only for random forest, bagged trees, bagged earth, bagged flexible
#' discriminant analysis, or conditional tree forest models), timeslice,
#' "adaptive_cv", "adaptive_boot" or "adaptive_LGOCV"
#' @keywords bioclimatic index, linear model
#' @export
#' @examples
#' fit_model(df = df.interp,
#' frml = huglin ~ elevation,
#' output = 'full',
#' plots = T)
fit_model <- function(df, frml, family = 'lm', subset = 'full', preProc = '',
verbose = F, force_in = '', gam_grid = NULL, cubist_grid = NULL,
rf_grid = NULL, svmRadial_grid = NULL, ...) {
fit_caret <- function() {
params <- switch (family,
'lm' = list('lm', NULL, caret::lmFuncs),
'gam' = list('gam', gam_grid, caret::gamFuncs),
'cubist' = list('cubist', cubist_grid),
'rf' = list('rf', rf_grid, caret::rfFuncs),
'svmRadial' = list('svmRadial', svmRadial_grid)
)
pred <- all.vars(frml)[1]
vars <- all.vars(frml)[-1]
df <- df %>% dplyr::select(dplyr::matches(paste(all.vars(frml), collapse = '|')))
if (subset == 'best') {
mae_min <- 1000000000
model <- NULL
for (i in 1:length(vars)) {
combinations <- combn(vars, m = i, simplify = F)
for (comb in combinations) {
frml_temp <- as.formula(str_c(pred, ' ~ ', str_c(comb, collapse = ' + ')))
model_temp <- caret::train(frml_temp,
data = df,
method = params[[1]],
preProcess = preProc,
trControl = ctrl,
tuneGrid = params[[2]])
mae <- model_temp[['results']][['MAE']]
if (mae < mae_min) {
model <- model_temp
mae_min <- mae
}
}
}
} else if (subset == 'rfe') {
if (params[[1]] == 'cubist') stop('Rfe not implemented for cubist regression.')
if (params[[1]] == 'svmRadial') stop('Rfe not implemented for svmRadial regression.')
ctrl_rfe <- caret::rfeControl(functions = params[[3]],
method = 'boot')
if (length(preProc) > 0) {
preProcValues <- caret::preProcess(df, method = preProc)
df_rfe <- predict(preProcValues, df)
}
model_rfe <- caret::rfe(x = dplyr::select(df_rfe, vars),
y = dplyr::pull(df_rfe, pred),
sizes = 1:length(vars),
rfeControl = ctrl_rfe)
opt_vars <- model_rfe$optVariables
if (length(force_in) > 0 & !force_in %in% opt_vars) opt_vars <- c(opt_vars, force_in)
frml_rfe <- as.formula(str_c(pred, ' ~ ', str_c(opt_vars, collapse = ' + ')))
model <- caret::train(frml_rfe,
data = df,
method = params[[1]],
preProcess = preProc,
trControl = ctrl,
tuneGrid = params[[2]])
} else if (subset == 'full') {
model <- caret::train(frml,
data = df,
method = params[[1]],
preProcess = preProc,
trControl = ctrl,
tuneGrid = params[[2]])
}
return(model)
}
if (!all(all.vars(frml) %in% colnames(df))) { stop('Not all frml elements found as columns in df') }
if (length(preProc) == 0) preProc <- NULL
#Set cross validation defaults if not supplied
ctrl.p <- list(...)
if (is.null(ctrl.p$method)) ctrl.p$method <- 'boot'
if (is.null(ctrl.p$number)) ctrl.p$number <- 25
if (is.null(ctrl.p$repeats)) ctrl.p$repeats <- NA
ctrl <- caret::trainControl(method = ctrl.p$method,
number = ctrl.p$number,
repeats = ctrl.p$repeats,
verboseIter = F)
time_start <- Sys.time()
if (family == 'lm' & ctrl.p$method == 'none' & subset == 'full') {
model.fit <- lm(frml, data = df)
verbose_string <- paste('R2: ', round(summary(model.fit)$r.squared, 2) )
} else {
model.fit <- fit_caret()
verbose_string <- paste('MAE: ', round(min(model.fit[['results']][['MAE']]), 2))
}
time_end <- Sys.time()
time_elapsed <- round(time_end - time_start, 3)
if (verbose) print(paste(family, 'model with', subset, 'subset', verbose_string, ' (', time_elapsed, 's )'))
return(model.fit)
}
#' Linear model fitting on subgroups of data
#'
#' This function allows you to fit a (multiple) linear model between
#' dependend and independend variables and perform cross validation to variable
#' subgroups of the data.
#' @param df data.frame, A data.frame that is used to fit a linear model between dependent and
#' independent variables.
#' @param frml formula, A formula object for building the linear model
#' @param group.cols string, column names that are used to group df
#' @param rowname.col string, name of the column that is used to give rownames to
#' the nested dataframes within df. Only useful if the desired output is 'augment' as
#' the rownames are used to name the output rows.
#' @param min.size integer, minimal amount of observations that have to be
#' within a group in order to fit the linear model. If the group is smaller, it
#' will be discarded.
#' @param output string, one of 'full', 'minimal', 'augmented', 'glanced' or 'tidied'.
#' 'full' will give all columns, 'minimal' only the linear model fits. For further
#' information about the rest see the respective function in package::broom.
#' @param ... further arguments passed on to caret::trainControl (method, number, repeats).
#' Defaults correspond to repeated crossvalidation using 5 folds repeated for 3 times.
#' @keywords bioclimatic index, linear model
#' @export
#' @examples
#' fit_linear_model_groups(df = df.interp,
#' frml = huglin ~ elevation,
#' predictor.raster = dem.st,
#' file.name = data/huglin.tif,
#' set.zero = T)
fit_linear_model_groups <- function(df, frml,
group.cols = NULL, rowname.col = NULL,
min.size = 30, output = 'full', ...) {
name_rows <- function(df, rowname.col) {
rownames(df) <- dplyr::pull(df, rowname.col)
return(df)
}
fit_caret <- function(df, frml, ctrl) {
model <- caret::train(frml,
data = df,
method = "lm",
preProcess = c("center", "scale"),
trControl = ctrl)
pb$tick()
return(model)
}
#Set cross validation defaults if not supplied
ctrl.p <- list(...)
if (is.null(ctrl.p$method)) ctrl.p$method <- 'repeatedcv'
if (is.null(ctrl.p$number)) ctrl.p$number <- 5
if (is.null(ctrl.p$repeats)) ctrl.p$repeats <- 3
ctrl = caret::trainControl(method = ctrl.p$method, number = ctrl.p$number,
repeats = ctrl.p$repeats, verboseIter = F)
if (is.null(group.cols)) {
df <- dplyr::mutate(df, group_col = 1)
group.cols <- 'group_col'
}
group.cols <- rlang::syms(group.cols)
df <- df %>%
dplyr::add_count(!!!group.cols) %>%
dplyr::filter_at(ncol(.), dplyr::all_vars(.>min.size))
if (nrow(df) == 0) { stop('Group sizes too small. Choose different group col or adjust min.size parameter.') }
df.nest <- tidyr::nest(df, -c(!!!group.cols))
if (!is.null(rowname.col)) {
df.nest <- dplyr::mutate(df.nest, data = purrr::map(data,
name_rows,
rowname.col = rowname.col))
}
#Set up progress bar
ticks <- nrow(df.nest)
print(paste0('Total amount of models to be fitted: ', ticks))
pb <- progress::progress_bar$new(total = ticks,
format = "[:bar] :percent eta: :eta")
pb$tick(0)
#Fit a model to every row of df.nest
df.models <- df.nest %>%
dplyr::mutate(caret.fit = purrr::map(data, fit_caret, frml = frml, ctrl = ctrl),
caret.results = purrr::map(caret.fit, 'results'),
lm.fit = purrr::map(data, lm, formula = frml),
glanced = purrr::map(lm.fit, broom::glance),
augmented = purrr::map(lm.fit, broom::augment),
tidied = purrr::map(lm.fit, broom::tidy)) %>%
arrange(!!!group.cols)
if ('group_col' %in% colnames(df.models)) { df.models <- dplyr::select(df.models, -c(!!!group.cols)) }
if (!output %in% c('minimal', 'full', 'glanced', 'augmented', 'tidied')) {
stop('Unknown output. Choose one of: full, glanced, augmented, tidied')
}
if (output == 'minimal') {
if (nrow(df.models) == 1) {
return(df.models$caret.fit[[1]])
} else {
return(df.models$caret.fit)
}
}
if (output != 'full') {
output <- rlang::syms(output)
df.models <- tidyr::unnest(df.models, !!!output, .drop = T)
}
return(df.models)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.