# Calculates permutation importance for binomial (including logistic) regression.
calc_permutation_importance_binomial <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
predict.fun = function(object,newdata){predict(object,type = "response",newdata=newdata)},
# Use minus log likelyhood as loss function, since it is what logistic regression tried to optimise.
loss.fun = function(x,y){-sum(log(1- abs(x - y)),na.rm = TRUE)})
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
importances_df
}
# Calculates permutation importance for linear regression.
calc_permutation_importance_linear <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
# predict.fun can be the default function, which is predict(object, newdata=newdata).
# For some reason, default loss.fun, which is mean((x - y)^2) returns NA, even with na.rm=TRUE. Rewrote it with sum() to avoid the issue.
loss.fun = function(x,y){sum((x - y)^2, na.rm = TRUE)/length(x)})
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
importances_df
}
# Calculates permutation importance for GLM Gaussian regression.
calc_permutation_importance_gaussian <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
# type needs to be "response" so that x in loss.fun can be used to calculate sum of squares with any choice of link function.
predict.fun = function(object,newdata){predict(object,type = "response",newdata=newdata)},
# For some reason, default loss.fun, which is mean((x - y)^2) returns NA, even with na.rm=TRUE. Rewrote it with sum() to avoid the issue.
loss.fun = function(x,y){sum((x - y)^2, na.rm = TRUE)/length(x)})
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
importances_df
}
# Calculates permutation importance for logistic regression.
calc_permutation_importance_poisson <- function(fit, target, vars, data) {
var_list <- as.list(vars)
importances <- purrr::map(var_list, function(var) {
mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
# type needs to be "link" since x in loss.fun expects linear predictor.
predict.fun = function(object,newdata){predict(object,type = "link",newdata=newdata)},
# Use minus log likelyhood as loss function.
# Reference: https://en.wikipedia.org/wiki/Poisson_regression#Maximum_likelihood-based_parameter_estimation
# https://stats.stackexchange.com/questions/70054/how-is-it-possible-that-poisson-glm-accepts-non-integer-numbers
loss.fun = function(x,y){-sum(y*x-exp(x), na.rm = TRUE)})
})
importances <- purrr::flatten_dbl(importances)
importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
importances_df
}
# TODO: Make this function model-agnostic and consolidate. There are similar code for lm/glm, ranger, rpart, and xgboost.
# Builds partial_dependency object for lm/glm with same structure (a data.frame with attributes.) as edarf::partial_dependence.
partial_dependence.lm_exploratory <- function(fit, target, vars = colnames(data),
n = c(min(nrow(unique(data[, vars, drop = FALSE])), 25L), nrow(data)), # Keeping same default of 25 as edarf::partial_dependence, although we usually overwrite from callers.
interaction = FALSE, uniform = TRUE, data, ...) {
predict.fun <- function(object, newdata) {
predict(object, newdata = newdata, type = "response")
}
aggregate.fun <- function(x) {
c("preds" = mean(x))
}
# Generate grid points based on quantile, so that FIRM calculated based on it would make good sense even when there are some outliers.
points <- list()
quantile_points <- list()
for (cname in vars) {
if (is.numeric(data[[cname]])) {
coldata <- data[[cname]]
minv <- min(coldata, na.rm=TRUE)
maxv <- max(coldata, na.rm=TRUE)
grid <- minv + (0:20)/20 * (maxv - minv)
quantile_grid <- quantile(coldata, probs=1:24/25)
quantile_points[[cname]] <- quantile_grid
points[[cname]] <- sort(unique(c(grid, quantile_grid)))
}
else {
points[[cname]] <- unique(data[[cname]])
}
}
args = list(
"data" = data,
"vars" = vars,
"n" = n,
"model" = fit,
"points" = points,
"predict.fun" = predict.fun,
"aggregate.fun" = aggregate.fun,
...
)
if (length(vars) > 1L & !interaction) { # More than one variables are there. Iterate calling mmpf::marginalPrediction.
pd = rbindlist(sapply(vars, function(x) {
args$vars = x
if ("points" %in% names(args))
args$points = args$points[x]
mp = do.call(mmpf::marginalPrediction, args)
names(mp)[ncol(mp)] = target
mp
}, simplify = FALSE), fill = TRUE)
data.table::setcolorder(pd, c(vars, colnames(pd)[!colnames(pd) %in% vars]))
} else {
# Remove value label from vars to avoid unexpected column name in the result.
args$vars = as.character(vars)
pd = do.call(mmpf::marginalPrediction, args)
names(pd)[ncol(pd)] = target
}
attr(pd, "class") = c("pd", "data.frame")
attr(pd, "interaction") = interaction == TRUE
attr(pd, "target") = target
attr(pd, "vars") = vars
attr(pd, "points") = points
attr(pd, "quantile_points") = quantile_points
pd
}
# Calculate average marginal effects from model with margins package.
calc_average_marginal_effects <- function(model, data=NULL, with_confint=FALSE) {
if (with_confint) {
if (!is.null(data)) {
m <- margins::margins(model, data=data)
}
else {
m <- margins::margins(model)
}
ret <- as.data.frame(summary(m))
ret <- ret %>% dplyr::rename(term=factor, ame=AME, ame_low=lower, ame_high=upper) %>%
dplyr::select(term, ame, ame_low, ame_high) #TODO: look into SE, z, p too.
ret
}
else {
# Fast versin that only calls margins::margins().
# margins::margins() does a lot more than margins::marginal_effects(),
# and takes about 10 times more time.
if (!is.null(data)) {
me <- margins::marginal_effects(model, data=data)
}
else {
me <- margins::marginal_effects(model)
}
# For some reason, str_replace garbles column names generated from Date column with Japanese name. Using gsub instead to avoid the issue.
# term <- stringr::str_replace(names(me), "^dydx_", "")
term <- gsub("^dydx_", "", names(me))
ame <- purrr::flatten_dbl(purrr::map(me, function(x){mean(x, na.rm=TRUE)}))
ret <- data.frame(term=term, ame=ame)
ret
}
}
# VIF calculation definition from car::vif.
# Copied to avoid having to import many dependency packages.
vif <- function(mod, ...) {
if (any(is.na(coef(mod))))
stop ("there are aliased coefficients in the model")
v <- vcov(mod)
assign <- attr(model.matrix(mod), "assign")
if (names(coefficients(mod)[1]) == "(Intercept)") {
v <- v[-1, -1]
assign <- assign[-1]
}
else warning("No intercept: vifs may not be sensible.")
terms <- labels(terms(mod))
n.terms <- length(terms)
if (n.terms < 2) stop("model contains fewer than 2 terms")
R <- cov2cor(v)
detR <- det(R)
result <- matrix(0, n.terms, 3)
rownames(result) <- terms
colnames(result) <- c("GVIF", "Df", "GVIF^(1/(2*Df))")
for (term in 1:n.terms) {
subs <- which(assign == term)
result[term, 1] <- det(as.matrix(R[subs, subs])) *
det(as.matrix(R[-subs, -subs])) / detR
result[term, 2] <- length(subs)
}
if (all(result[, 2] == 1)) result <- result[, 1]
else result[, 3] <- result[, 1]^(1/(2 * result[, 2]))
result
}
# Calculate VIF and throw user friendly message in case of perfect collinearity.
calc_vif <- function(model, terms_mapping) {
tryCatch({
vif(model)
}, error = function(e){
# in case of perfect multicollinearity, vif throws error with message "there are aliased coefficients in the model".
# Check if it is the case. If coef() includes NA, corresponding variable is causing perfect multicollinearity.
coef_vec <- coef(model)
na_coef_vec <- coef_vec[is.na(coef_vec)]
if (length(na_coef_vec) > 0) {
na_coef_names <- names(na_coef_vec)
na_coef_names <- map_terms_to_orig(na_coef_names, terms_mapping) # Map column names in the message back to original.
message <- paste(na_coef_names, collapse = ", ")
message <- paste0("Variables causing perfect collinearity : ", message)
e$message <- message
}
stop(e)
})
}
# augment function just to filter out unknown categories in predictors to avoid error.
augment.lm_exploratory_0 <- function(x, data = NULL, newdata = NULL, ...) {
if (!is.null(newdata) && length(x$xlevels) > 0) {
for (i in 1:length(x$xlevels)) {
newdata <- newdata %>% dplyr::filter(!!rlang::sym(names(x$xlevels)[[i]]) %in% !!x$xlevels[[i]])
}
}
if (is.null(data)) { # Giving data argument when it is NULL causes error from augment.glm. Do the same for lm just in case.
ret <- broom:::augment.lm(x, newdata = newdata, se=TRUE, ...)
if (!is.null(ret$.rownames)) { # Clean up .rownames column augment.glm adds for some reason. Do the same for lm just in case.
ret$.rownames <- NULL
}
}
else {
ret <- broom:::augment.lm(x, data = data, newdata = newdata, se=TRUE, ...)
}
ret
}
tidy.lm_exploratory_0 <- function(x, ...) {
# tidy.lm raises error when class has more than "lm". Working it around here.
orig_class <- class(x)
class(x) <- "lm"
ret <- broom:::tidy.lm(x, ...)
class(x) <- orig_class
ret
}
#' lm wrapper with do
#' @return deta frame which has lm model
#' @param data Data frame to be used as data
#' @param formula Formula for lm
#' @param ... Parameters to be passed to lm function
#' @param keep.source Whether source should be kept in source.data column
#' @param augment Whether the result should be augmented immediately
#' @param group_cols A vector with columns names to be used as group columns
#' @param test_rate Ratio of test data
#' @param seed Random seed to control test data sampling
#' @export
build_lm <- function(data, formula, ..., keep.source = TRUE, augment = FALSE, group_cols = NULL, test_rate = 0.0, seed = 1){
validate_empty_data(data)
# make variables factor sorted by the frequency
fml_vars <- all.vars(formula)
for(var in fml_vars) {
if(is.character(data[[var]])){
data[[var]] <- forcats::fct_infreq(data[[var]])
}
}
if(!is.null(seed)){
set.seed(seed)
}
# deal with group columns by index because those names might be changed
group_col_index <- colnames(data) %in% group_cols
# change column names to avoid name conflict when tidy or glance are executed
reserved_names <- c(
"model", ".test_index", "data", ".model_metadata",
# for tidy
"term", "estimate", "std.error", "statistic", "p.value",
# for glance
"r.squared", "adj.r.squared", "sigma", "statistic", "p.value",
"df", "logLik", "AIC", "BIC", "deviance", "df.residual"
)
if(test_rate < 0 | 1 < test_rate){
stop("test_rate must be between 0 and 1")
} else if (test_rate == 1){
stop("test_rate must be less than 1")
}
colnames(data)[group_col_index] <- avoid_conflict(
reserved_names,
colnames(data)[group_col_index],
".group"
)
# make column names unique
colnames(data) <- make.unique(colnames(data), sep = "")
if(!is.null(group_cols)){
data <- dplyr::group_by(data, !!!rlang::syms(colnames(data)[group_col_index]))
}
# Filter out NA and Inf from target variable.
target_cols <- all.vars(lazyeval::f_lhs(formula))
for (target_col in target_cols) {
data <- data %>%
dplyr::filter(!is.na(!!rlang::sym(target_col)) & !is.infinite(!!rlang::sym(target_col)))
}
group_col_names <- grouped_by(data)
# check if grouping columns are in the formula
grouped_var <- group_col_names[group_col_names %in% fml_vars]
if (length(grouped_var) == 1) {
stop(paste0(grouped_var, " is a grouping column. Please remove it from variables."))
} else if (length(grouped_var) > 0) {
stop(paste0(paste(grouped_var, collapse = ", "), " are grouping columns. Please remove them from variables."))
}
model_col <- "model"
source_col <- "source.data"
caller <- match.call()
# this expands dots arguemtns to character
arg_char <- expand_args(caller, exclude = c("data", "keep.source", "augment", "group_cols", "test_rate", "seed"))
ret <- tryCatch({
ret <- data %>%
tidyr::nest(source.data=-dplyr::group_cols()) %>%
# create test index
dplyr::mutate(.test_index = purrr::map(source.data, function(df){
sample_df_index(df, rate = test_rate)
})) %>%
# slice training data
dplyr::mutate(model = purrr::map2(source.data, .test_index, function(df, index){
data <- safe_slice(df, index, remove = TRUE)
# execute lm with parsed arguments
model <- eval(parse(text = paste0("stats::lm(data = data, ", arg_char, ")")))
# Strip environments to save rds size when cached.
if (!is.null(model$terms)) {
attr(model$terms,".Environment")<-NULL
}
if (!is.null(model$model) && !is.null(attr(model$model,"terms"))) {
attr(attr(model$model,"terms"),".Environment") <- NULL
}
class(model) <- c("lm_exploratory_0", class(model))
model
})) %>%
dplyr::mutate(.model_metadata = purrr::map(source.data, function(df){
if(!is.null(formula)){
create_model_meta(df, formula)
} else {
list()
}
}))
if(!keep.source & !augment){
ret <- dplyr::select(ret, -source.data)
} else {
class(ret[[source_col]]) <- c("list", ".source.data")
}
ret <- dplyr::rowwise(ret)
ret
}, error = function(e){
# Error message was changed when upgrading dplyr to 0.7.1
# so use stringr::str_detect to make these robust.
# With dplyr 1.0.8, it seems that the message is separated into e$message and e$parant$message.
# With dplyr 1.0.10, it seems that the message is further separated into e$message, e$parant$message, and e$parent$parent$message.
# First extract the root message text.
if (!is.null(e$parent)) {
if (!is.null(e$parent$parent)) {
message <- e$parent$parent$message
}
else {
message <- e$parent$message
}
}
else { # Handling for before dplyr 1.0.8. (With 6.9.5 we do not bundle dplyr 1.0.8 yet.)
message <- e$message
}
# Run text match on the extracted message.
if (stringr::str_detect(message, "contrasts can be applied only to factors with 2 or more levels")) {
stop("more than 1 unique values are expected for categorical columns assigned as predictors")
}
if(stringr::str_detect(message, "0 \\(non\\-NA\\) cases")){
stop("no data after removing NA")
}
stop(message)
})
if(augment){
if(test_rate == 0){
ret <- prediction(ret, data = "training")
} else {
ret <- prediction(ret, data = "test")
}
} else {
class(ret[[model_col]]) <- c("list", ".model", ".model.lm")
}
ret
}
# Map terms back to the original variable names.
# If term is for categorical variable, e.g. c1_UA,
# it will be mapped to name like "Carrier: UA".
map_terms_to_orig <- function(terms, mapping) {
# Extract name part and value part of the coefficient name separately.
var_names <- stringr::str_extract(terms, "^c[0-9]+_")
var_values <- stringr::str_remove(terms, "^c[0-9]+_")
var_names_orig <- mapping[var_names] # Map variable names back to the original.
# is.na(var_names) is for "(Intercept)".
# var_values == "" is for numerical variables.
# Categorical variables are expected to have var_values.
ret <- dplyr::if_else(is.na(var_names), terms, dplyr::if_else(var_values == "", var_names_orig, paste0(var_names_orig, ": ", var_values)))
ret
}
#' Common preprocessing of regression data to be done BEFORE sampling.
#' Only common operations to be done, for example, in Summary View too.
#' @export
preprocess_regression_data_before_sample <- function(df, target_col, predictor_cols, filter_predictor_numeric_na=TRUE) {
# ranger and rpart works with NA or Inf in the target, but we decided it would build rather meaningless or biased model.
# For example, rpart binary classification just replaces NAs with FALSE, which would change the meaning of data inadvertently.
# lm will filter out NAs anyway internally, and errors out if the remaining rows are with single value in any predictor column.
# Also, filter Inf/-Inf too to avoid error at lm.
# So, we always filter out NA/Inf from target variable.
# NOTE: This has to be done before removing of all-NA predictor columns, since this operation might make new all-NA predictor columns.
df <- df %>%
dplyr::filter(!is.na(!!rlang::sym(target_col)) & !is.infinite(!!rlang::sym(target_col))) # this form does not handle group_by. so moved into each_func from outside.
# Remove all-NA-or-Inf columns.
# NOTE: This has to be done bofore filtering predictor numeric NAs. Otherwise, all the rows could be filtered out.
cols <- predictor_cols
for (col in predictor_cols) {
if(all(is.na(df[[col]]) | is.infinite(df[[col]]))){
# remove columns if they are all NA or Inf
cols <- setdiff(cols, col)
df[[col]] <- NULL # drop the column so that SMOTE will not see it.
}
}
if (length(cols) == 0) {
stop("No predictor column is left after removing columns with only NA or Inf values.")
}
# To avoid unused factor level that causes margins::marginal_effects() to fail, filtering operation has
# to be done before factor level adjustments.
# This is done before sampling so that we will end up with more valid rows in the end.
for(col in cols){
if(is.numeric(df[[col]]) || lubridate::is.Date(df[[col]]) || lubridate::is.POSIXct(df[[col]])) {
# For numeric cols, filter NA rows, because lm will anyway do this internally, and errors out
# if the remaining rows are with single value in any predictor column.
# Filter Inf/-Inf too to avoid error at lm.
# Do the same for Date/POSIXct, because we will create numeric columns from them.
# For rpart, filter NAs for numeric columns to avoid instability. It seems that the resulting tree from rpart sometimes becomes
# simplistic (e.g. only one split in the tree), especially in Exploratory for some reason, if we let rpart handle the handling of NAs,
# even though it is supposed to just filter out rows with NAs, which is same as what we are doing here.
if (filter_predictor_numeric_na) {
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(col)) & !is.infinite(!!rlang::sym(col)))
}
else {
# For ranger, removing numeric NA is not necessary.
# But even for ranger, filter Inf/-Inf to avoid following error from ranger.
# Error in seq.default(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = length.out) : 'from' must be a finite number
# TODO: In exp_rpart and calc_feature_imp, we have logic to remember and restore NA rows, but they are probably not made use of
# if we filter NA rows here.
df <- df %>% dplyr::filter(!is.na(!!rlang::sym(col)))
}
}
}
if (nrow(df) == 0) {
stop("No row is left after removing NA/Inf from numeric, Date, or POSIXct columns.")
}
attr(df, 'predictors') <- cols
df
}
#' Common preprocessing of regression data to be done AFTER sampling.
#' Only common operations to be done, for example, in Summary View too.
#' @export
#' @param df
#' @param target_col
#' @param predictor_cols
#' @param predictor_n
#' @param name_map
#' @param other_level - Level for "Other" group. Default is "Other".
preprocess_regression_data_after_sample <- function(df, target_col, predictor_cols,
predictor_n = 12, # so that at least months can fit in it.
name_map = NULL,
other_level = "Other") {
c_cols <- predictor_cols
for(col in predictor_cols){
if(lubridate::is.Date(df[[col]]) || lubridate::is.POSIXct(df[[col]])) {
c_cols <- setdiff(c_cols, col)
absolute_time_col <- avoid_conflict(colnames(df), paste0(col, "_abs_time"))
wday_col <- avoid_conflict(colnames(df), paste0(col, "_w_"))
day_col <- avoid_conflict(colnames(df), paste0(col, "_day_of_month"))
yday_col <- avoid_conflict(colnames(df), paste0(col, "_day_of_year"))
month_col <- avoid_conflict(colnames(df), paste0(col, "_m_"))
year_col <- avoid_conflict(colnames(df), paste0(col, "_year"))
new_name <- c(absolute_time_col, wday_col, day_col, yday_col, month_col, year_col)
names(new_name) <- paste(
names(name_map)[name_map == col],
c(
"_abs_time",
"_w_",
"_day_of_month",
"_day_of_year",
"_m_",
"_year"
), sep="")
name_map <- c(name_map, new_name)
c_cols <- c(c_cols, absolute_time_col, wday_col, day_col, yday_col, month_col, year_col)
df[[absolute_time_col]] <- as.numeric(df[[col]])
# turn it into unordered factor since if it is ordered factor,
# lm/glm takes polynomial terms (Linear, Quadratic, Cubic, and so on) and use them as variables,
# which we do not want for this function.
# Reference: https://hlplab.wordpress.com/2008/01/28/the-mysterious-l-q-and-c/
df[[wday_col]] <- factor(lubridate::wday(df[[col]], label=TRUE), ordered=FALSE)
# lubridate::day returns integer.
# Convert integer to numeric. mmpf::marginalPrediction we use for partial dependence throws assertion error, if the data is integer and specified grid points are not integer.
df[[day_col]] <- as.numeric(lubridate::day(df[[col]]))
df[[yday_col]] <- lubridate::yday(df[[col]])
# turn it into unordered factor for the same reason as wday.
df[[month_col]] <- factor(lubridate::month(df[[col]], label=TRUE), ordered=FALSE)
df[[year_col]] <- lubridate::year(df[[col]])
if(lubridate::is.POSIXct(df[[col]])) {
hour_col <- avoid_conflict(colnames(df), paste0(col, "_hour"))
new_name <- c(hour_col)
names(new_name) <- paste(
names(name_map)[name_map == col],
c(
"_hour"
), sep="")
name_map <- c(name_map, new_name)
c_cols <- c(c_cols, hour_col)
df[[hour_col]] <- factor(lubridate::hour(df[[col]])) # treat hour as category
}
df[[col]] <- NULL # drop original Date/POSIXct column to pass SMOTE later.
} else if(is.factor(df[[col]])) {
df[[col]] <- forcats::fct_drop(df[[col]]) # margins::marginal_effects() fails if unused factor level exists. Drop them to avoid it.
# 1. if the data is factor, respect the levels and keep first 10 levels, and make others "Others" level.
# 2. if the data is ordered factor, turn it into unordered. For ordered factor,
# lm/glm takes polynomial terms (Linear, Quadratic, Cubic, and so on) and use them as variables,
# which we do not want for this function.
if (length(levels(df[[col]])) > predictor_n + 1) { # +1 is for 'Others' group.
df[[col]] <- forcats::fct_other(factor(df[[col]], ordered=FALSE), keep=levels(df[[col]])[1:predictor_n], other_level=other_level)
}
else {
df[[col]] <- factor(df[[col]], ordered=FALSE)
}
# turn NA into (Missing) factor level. Without this, lm or glm drops rows internally.
df[[col]] <- forcats::fct_explicit_na(df[[col]])
} else if(is.logical(df[[col]])) {
# 1. convert data to factor if predictors are logical. (as.factor() on logical always puts FALSE as the first level, which is what we want for predictor.)
# 2. turn NA into (Missing) factor level so that lm will not drop all the rows.
# For ranger, we need to convert logical to factor too since na.roughfix only works for numeric or factor.
df[[col]] <- forcats::fct_explicit_na(as.factor(df[[col]]))
} else if(!is.numeric(df[[col]])) {
# 1. convert data to factor if predictors are not numeric or logical
# and limit the number of levels in factor by fct_lump.
# we use ties.method to handle the case where there are many unique values. (without it, they all survive fct_lump.)
# TODO: see if ties.method would make sense for calc_feature_imp.
# 2. turn NA into (Missing) factor level so that lm will not drop all the rows.
df[[col]] <- forcats::fct_explicit_na(forcats::fct_lump(forcats::fct_infreq(as.factor(df[[col]])), n=predictor_n, ties.method="first", other_level=other_level))
} else if(is.integer(df[[col]])) {
# Convert integer to numeric. mmpf::marginalPrediction we use for partial dependence throws assertion error, if the data is integer and specified grid points are not integer.
# To avoid something like that, we just convert integer to numeric before building predictive models.
df[[col]] <- as.numeric(df[[col]])
}
}
# If target is factor, turn it into unordered factor if it is not already.
if (is.factor(df[[target_col]])) {
df[[target_col]] <- factor(df[[target_col]], ordered=FALSE)
}
# remove columns with only one unique value
cols_copy <- c_cols
for (col in cols_copy) {
unique_val <- unique(df[[col]])
if (length(unique_val[!is.na(unique_val)]) == 1) {
c_cols <- setdiff(c_cols, col)
df[[col]] <- NULL # drop the column so that SMOTE will not see it.
}
}
if (length(c_cols) == 0) {
# Previous version of message - stop("The selected predictor variables are invalid since they have only one unique values.")
stop("Invalid Predictors: Only one unique value.") # Message is made short so that it fits well in the Summary table.
}
attr(df, 'predictors') <- c_cols # Pass up list of updated predictor column names.
attr(df, 'name_map') <- name_map # Pass up list of updated column name map. Just for legacy Date/POSIXct column handling.
df
}
remove_outliers_for_regression_data <- function(df, target_col, predictor_cols,
target_outlier_filter_type,
target_outlier_filter_threshold,
predictor_outlier_filter_type,
predictor_outlier_filter_threshold) {
if (!is.null(target_outlier_filter_type) || !is.null(predictor_outlier_filter_type)) {
df$.is.outlier <- FALSE #TODO: handle possibility of name conflict.
if (!is.null(target_outlier_filter_type)) {
is_outlier <- function(x) {
res <- detect_outlier(x, type=target_outlier_filter_type, threshold=target_outlier_filter_threshold) %in% c("Lower", "Upper")
res
}
if (is.numeric(df[[target_col]])) {
df$.is.outlier <- df$.is.outlier | is_outlier(df[[target_col]])
}
}
if (!is.null(predictor_outlier_filter_type)) {
is_outlier <- function(x) {
res <- detect_outlier(x, type=predictor_outlier_filter_type, threshold=predictor_outlier_filter_threshold) %in% c("Lower", "Upper")
res
}
for (col in predictor_cols) {
if (is.numeric(df[[col]])) {
df$.is.outlier <- df$.is.outlier | is_outlier(df[[col]])
}
}
}
df <- df %>% dplyr::filter(!.is.outlier)
df$.is.outlier <- NULL # Removing the temporary column.
}
df
}
#' builds lm model quickly for analytics view.
#' @param seed - Random seed to control data sampling, SMOTE, and bootstrapping for confidence interval of relative importance.
#' @param test_rate Ratio of test data
#' @export
build_lm.fast <- function(df,
target,
...,
target_fun = NULL,
predictor_funs = NULL,
weight = NULL,
weight_fun = NULL,
model_type = "lm",
family = NULL,
link = NULL,
max_nrow = 50000,
predictor_n = 12, # so that at least months can fit in it.
normalize_target = FALSE,
normalize_predictors = FALSE,
target_outlier_filter_type = NULL,
target_outlier_filter_threshold = NULL,
predictor_outlier_filter_type = NULL,
predictor_outlier_filter_threshold = NULL,
smote = FALSE,
smote_target_minority_perc = 40,
smote_max_synth_perc = 200,
smote_k = 5,
importance_measure = "permutation", # "firm", or "permutation" if available. Defaulting to permutation for backward compatibility for pre-6.5.
with_marginal_effects = FALSE,
with_marginal_effects_confint = FALSE,
variable_metric = NULL,
p_value_threshold = 0.05,
max_pd_vars = 20,
pd_sample_size = 500,
pd_grid_resolution = 20,
seed = 1,
test_rate = 0.0,
test_split_type = "random" # "random" or "ordered"
){
target_col <- tidyselect::vars_select(names(df), !! rlang::enquo(target))
orig_selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))
target_funs <- NULL
if (!is.null(target_fun)) {
target_funs <- list(target_fun)
names(target_funs) <- target_col
df <- df %>% mutate_predictors(target_col, target_funs)
}
if (!is.null(predictor_funs)) {
df <- df %>% mutate_predictors(orig_selected_cols, predictor_funs)
selected_cols <- names(unlist(predictor_funs))
}
else {
selected_cols <- orig_selected_cols
}
weight_col <- tidyselect::vars_select(names(df), !! rlang::enquo(weight))
if (is.null(weight_col) || length(weight_col) == 0) { # It seems that if weight_col is not specified weight_col can be named character(0), which can be detected by length(weight_col)=0.
weight_col <- NULL
}
if (!is.null(weight_col) && !is.null(weight_fun)) {
weight_funs <- list(weight_fun)
names(weight_funs) <- weight_col
df <- df %>% mutate_predictors(weight_col, weight_funs)
}
if (!is.null(weight_col) && min(df[[weight_col]], na.rm=TRUE) <= 0) {
# We enforce strict positive values because with weights including 0, the model throws error at prediction.
stop("EXP-ANA-10 :: [] :: Weight column must be positive.")
}
if (!is.null(weight_col)) {
# Fill NA weight with 1.
df <- df %>% dplyr::mutate(!!rlang::sym(weight_col) := ifelse(is.na(!!rlang::sym(weight_col)), 1, !!rlang::sym(weight_col)))
}
grouped_cols <- grouped_by(df)
if (!is.null(variable_metric) && variable_metric == "ame") { # Special argument for integration with Analytics View.
with_marginal_effects <- TRUE
}
if (model_type == "glm" && is.null(family)) {
family <- "binomial" # default for glm is logistic regression.
link <- "logit"
}
# For backward compatibility, importance_measure defaults to "permutation",
# but these GLMs do not have permutation importance support. Fall back to FIRM.
if (importance_measure == "permutation" &&
model_type == "glm" && family %in% c("Gamma", "negativebinomial", "inverse.gaussian")) {
importance_measure <- "firm"
}
if (model_type == "glm" && family == "binomial" && (is.null(link) || link == "logit")) {
if (!is.logical(df[[target_col]]) && !is.numeric(df[[target_col]])) {
# numeric still works, but our official guidance will be to use logical.
stop("Target variable for logistic regression must be a logical.")
}
}
# If we do permutation importance, sort predictors so that the result of it is stable against change of column order.
# Otherwise, avoid sorting so that user has control over the order of variables on partial dependence plot.
if (model_type == "lm" || family %in% c("binomial", "gaussian") || (family == "poisson" && (is.null(link) || link == "log"))) {
selected_cols <- stringr::str_sort(selected_cols)
}
if(test_rate < 0 | 1 < test_rate){
stop("test_rate must be between 0 and 1")
} else if (test_rate == 1){
stop("test_rate must be less than 1")
}
# drop unrelated columns so that SMOTE later does not have to deal with them.
# select_ was not able to handle space in target_col. let's do it in base R way.
df <- df[,colnames(df) %in% c(grouped_cols, selected_cols, target_col, weight_col), drop=FALSE]
# Remove grouped col or target col. Weight col is not removed because it can be one of the predictors.
selected_cols <- setdiff(selected_cols, c(grouped_cols, target_col))
if (any(c(target_col, selected_cols, weight_col) %in% grouped_cols)) {
stop("grouping column is used as variable columns")
}
if (predictor_n < 2) {
stop("Max # of categories for explanatory vars must be at least 2.")
}
orig_levels <- NULL # For recording original factor levels for binary classification, to show data size for each class in Summary View.
if (!is.null(model_type) && model_type == "glm" && family %in% c("binomial", "quasibinomial")) {
# binomial case.
unique_val <- unique(df[[target_col]])
if (length(unique_val[!is.na(unique_val)]) != 2) {
stop(paste0("Column to predict (", target_col, ") with Binomial Regression must have 2 unique values."))
}
if (!is.numeric(df[[target_col]]) && !is.factor(df[[target_col]]) && !is.logical(df[[target_col]])) {
# make other types factor so that it passes stats::glm call.
df[[target_col]] <- factor(df[[target_col]])
}
# record original factor levels.
if (is.factor(df[[target_col]])) {
orig_levels <- levels(df[[target_col]])
# Keep only the ones that actually are used.
# One with larger index seems to be treated as TRUE (i.e. 1 in model$y) by glm.
orig_levels <- orig_levels[orig_levels %in% unique_val]
}
else if (is.logical(df[[target_col]])) {
orig_levels <- c("FALSE","TRUE")
}
}
else { # this means the model is lm or glm with family other than binomial
if (!is.numeric(df[[target_col]])) {
# TODO: message should handle other than lm too.
stop(paste0("Column to predict (", target_col, ") with Linear Regression must be numeric"))
}
}
# Replace spaces with dots in column names. margins::marginal_effects() fails without it.
clean_df <- df
# Replace column names with names like c1_, c2_...
# _ is so that name part and value part of categorical coefficient can be separated later,
# even with values that starts with number like "9E".
# We avoid following known issues by this.
# - Column name issue for marginal_effects(). Space is not handled well.
# ()"' are known to be ok as of version 5.5.2.
# - Column name issue for mmpf::marginalPrediction (for partial dependence). Comma is not handled well.
# Note that the data frames in source_data column are with the replaced column names for simplicity,
# rather than the original column names, unlike what we do for ranger or rpart. (Maybe we should do it for ranger or rpart.)
# The reverse mapping of the column names are done in augment.lm_exploratory or augment.glm_exploratory.
names(clean_df) <- paste0("c",1:length(colnames(clean_df)), "_")
# this mapping will be used to restore column names
name_map <- colnames(clean_df)
names(name_map) <- colnames(df)
# clean_names changes column names
# without chaning grouping column name
# information in the data frame
# and it causes an error,
# so the value of grouping columns
# should be still the names of grouping columns
name_map[grouped_cols] <- grouped_cols
colnames(clean_df) <- name_map
clean_target_col <- name_map[target_col]
if (!is.null(weight_col)) {
clean_weight_col <- name_map[weight_col]
}
else {
clean_weight_col <- NULL
}
clean_cols <- name_map[selected_cols]
each_func <- function(df) {
tryCatch({
if(!is.null(seed)){
set.seed(seed)
}
df_test <- NULL # declare variable for test data
df <- preprocess_regression_data_before_sample(df, clean_target_col, clean_cols)
clean_cols <- attr(df, 'predictors') # predictors are updated (removed) in preprocess_pre_sample. Catch up with it.
# Sample the data because randomForest takes long time if data size is too large.
# If we are to do SMOTE, do not down sample here and let exp_balance handle it so that we do not sample out precious minority data.
sampled_nrow <- NULL
if (!smote) {
if (!is.null(max_nrow) && nrow(df) > max_nrow) {
# Record that sampling happened.
sampled_nrow <- max_nrow
df <- df %>% sample_rows(max_nrow)
}
}
# Remove outliers if specified so.
# This has to be done before preprocess_regression_data_after_sample, since it can remove rows and reduce number of unique values,
# just like sampling.
df <- remove_outliers_for_regression_data(df, clean_target_col, clean_cols,
target_outlier_filter_type,
target_outlier_filter_threshold,
predictor_outlier_filter_type,
predictor_outlier_filter_threshold)
# Capture the classes of the columns at this point before preprocess_regression_data_after_sample,
# so that we know the original classes of columns before characters are turned into factors,
# so that we can sort the partial dependence data for display accordingly.
# preprocess_regression_data_after_sample can remove columns, but it should not cause problem that we have more columns in
# orig_predictor_classes than the partial dependence data.
# Also, preprocess_regression_data_after_sample has code to add columns extracted from Date/POSIXct, but with recent releases,
# that should not happen, since the extraction is already done by mutate_predictors.
orig_predictor_classes <- capture_df_column_classes(df, clean_cols)
df <- preprocess_regression_data_after_sample(df, clean_target_col, clean_cols, predictor_n = predictor_n, name_map = name_map)
c_cols <- attr(df, 'predictors') # predictors are updated (added and/or removed) in preprocess_post_sample. Catch up with it.
name_map <- attr(df, 'name_map')
# Normalize numeric target variable,
# after all column changes for Date/POSIXct, filtering, dropping columns above are done.
if (normalize_target) {
if (is.numeric(df[[clean_target_col]])) {
df[[clean_target_col]] <- normalize(df[[clean_target_col]])
}
}
# Normalize numeric predictors so that resulting coefficients are comparable among them,
# after all column changes for Date/POSIXct, filtering, dropping columns above are done.
if (normalize_predictors) {
for (col in c_cols) {
if (is.numeric(df[[col]])) {
df[[col]] <- normalize(df[[col]])
}
}
}
# Reverse mapping of variable names.
terms_mapping <- names(name_map)
names(terms_mapping) <- name_map
# build formula for lm
rhs <- paste0("`", c_cols, "`", collapse = " + ")
# TODO: This clean_target_col is actually not a cleaned column name since we want lm to show real name. Clean up our variable name.
fml <- as.formula(paste0("`", clean_target_col, "` ~ ", rhs))
if (model_type == "glm") {
if (smote) {
if (with_marginal_effects) {
# Keep the version of data before SMOTE,
# since we want to know average marginal effect on a data that has
# close distribution to the original data.
df_before_smote <- df
}
# Note: If there is weight column, we synthesize weight column as well.
df <- df %>% exp_balance(clean_target_col, target_size = max_nrow, target_minority_perc = smote_target_minority_perc, max_synth_perc = smote_max_synth_perc, k = smote_k)
df <- df %>% dplyr::select(-synthesized) # Remove synthesized column added by exp_balance(). TODO: Handle it better. We might want to show it in resulting data.
for(col in names(df)){
if(is.factor(df[[col]])) {
# margins::marginal_effects() fails if unused factor level exists. Drop them to avoid it.
# In case of SMOTE, this has to be done after that. TODO: Do this just once in any case.
df[[col]] <- forcats::fct_drop(df[[col]])
if (with_marginal_effects) {
df_before_smote <- df_before_smote %>% dplyr::filter(!!rlang::sym(col) %in% levels(df[[col]]))
df_before_smote[[col]] <- forcats::fct_drop(df_before_smote[[col]])
}
}
}
if (with_marginal_effects) {
# Sample df_before_smote for speed.
# We do not remove imbalance here to keep close distribution to the original data.
df_before_smote <- df_before_smote %>% sample_rows(max_nrow)
}
}
# split training and test data
source_data <- df
test_index <- sample_df_index(source_data, rate = test_rate, ordered = (test_split_type == "ordered"))
df <- safe_slice(source_data, test_index, remove = TRUE)
if (test_rate > 0) {
# Test mode. Make prediction with test data here, rather than repeating it in Analytics View preprocessors.
df_test <- safe_slice(source_data, test_index, remove = FALSE)
}
# When link is not specified, use default link function for each family,
# except for the case where family is negativebinomial, which we should use MASS::glm.nb
if (is.null(link) && family != "negativebinomial") {
if (is.null(clean_weight_col)) {
model <- stats::glm(fml, data = df, family = family)
}
else {
model <- stats::glm(fml, data = df, family = family, weights=df[[clean_weight_col]])
}
}
else {
if (family == "gaussian") {
family_arg <- stats::gaussian(link=link)
}
else if (family == "binomial") {
family_arg <- stats::binomial(link=link)
}
else if (family == "Gamma") {
family_arg <- stats::Gamma(link=link)
}
else if (family == "inverse.gaussian") {
family_arg <- stats::inverse.gaussian(link=link)
}
else if (family == "poisson") {
family_arg <- stats::poisson(link=link)
}
else if (family == "quasi") {
family_arg <- stats::quasi(link=link)
}
else if (family == "quasibinomial") {
family_arg <- stats::quasibinomial(link=link)
}
else if (family == "quasipoisson") {
family_arg <- stats::quasipoisson(link=link)
}
else if (family == "negativebinomial") {
family_arg <- family
}
else { # default gaussian
family_arg <- stats::gaussian(link=link)
}
if (family == "negativebinomial"){
# In MASS::glm.nb function, the link arg must be one of log, sqrt or identity.
# So if the other is used, the link arg should be set to "log" which is the default value.
if (is.null(link) || (!link %in% c("log", "sqrt", "identity"))){
link <- "log"
}
if (dplyr::n_distinct(df[[clean_target_col]]) == 1) {
# If only 1 unique value is there in target column, glm.nb seems to return error like following.
# Error in while ((it <- it + 1) < limit && abs(del) > eps) { :
# missing value where TRUE/FALSE needed
stop("Target column has only one unique value.")
}
# The argument link in MASS::glm.nb is evaluated by substitution with delay,
# so the variable specified in the argument is interpreted as the link argument as it is.
# For example, if you execute like MASS::glm.nb(fmt, data = df, link = link), the following error will occur
# link "link" not available for poisson family; available links are 'log', 'identity', 'sqrt'
# Therefore, we used eval to pass the string (log etc.) specified in the argument to link as it is.
if (is.null(clean_weight_col)) {
model <- eval(parse(text=paste0("MASS::glm.nb(fml, data=df, link=", link, ")")))
}
else {
model <- eval(parse(text=paste0("MASS::glm.nb(fml, data=df, link=", link, ", weights=df[[clean_weight_col]])")))
}
# A model by MASS::glm.nb has not a formula attribute.
model$formula <- fml
} else {
if (is.null(clean_weight_col)) {
model <- stats::glm(fml, data = df, family = family_arg)
}
else {
model <- stats::glm(fml, data = df, family = family_arg, weights=df[[clean_weight_col]])
}
}
}
if (length(c_cols) > 1) { # Skip importance calculation if there is only one variable.
if (importance_measure == "permutation") { # For firm case, we need to first calculate partial dependence.
if (family == "binomial") {
model$imp_df <- calc_permutation_importance_binomial(model, clean_target_col, c_cols, df)
}
else if (family == "poisson" && (is.null(link) || link == "log")) {
model$imp_df <- calc_permutation_importance_poisson(model, clean_target_col, c_cols, df)
}
else if (family == "gaussian") {
model$imp_df <- calc_permutation_importance_gaussian(model, clean_target_col, c_cols, df)
}
}
}
else {
error <- simpleError("Variable importance requires two or more variables.")
model$imp_df <- error
}
}
else { # model_type == "lm"
# split training and test data
source_data <- df
test_index <- sample_df_index(source_data, rate = test_rate, ordered = (test_split_type == "ordered"))
df <- safe_slice(source_data, test_index, remove = TRUE)
if (test_rate > 0) {
df_test <- safe_slice(source_data, test_index, remove = FALSE)
}
if (is.null(clean_weight_col)) {
model <- stats::lm(fml, data = df)
}
else {
model <- stats::lm(fml, data = df, weights=df[[clean_weight_col]])
}
if (length(c_cols) > 1) { # Skip importance calculation if there is only one variable.
if (importance_measure == "permutation") { # For firm case, we need to first calculate partial dependence.
model$imp_df <- calc_permutation_importance_linear(model, clean_target_col, c_cols, df)
}
}
else {
error <- simpleError("Variable importance requires two or more variables.")
model$imp_df <- error
}
}
# Strip environments to save rds size when cached.
if (!is.null(model$terms)) {
attr(model$terms,".Environment")<-NULL
}
if (!is.null(model$formula)) {
attr(model$formula,".Environment")<-NULL
}
if (!is.null(model$model) && !is.null(attr(model$model,"terms"))) {
attr(attr(model$model,"terms"),".Environment") <- NULL
}
tryCatch({
model$vif <- calc_vif(model, terms_mapping)
}, error = function(e){
model$vif <<- e
})
if (test_rate > 0) {
# Remove rows with categorical values which does not appear in training data and unknown to the model.
# Record where it was in unknown_category_rows_index, and keep it with model, so that prediction that
# matches with original data can be generated later.
df_test_clean <- cleanup_df_for_test(df_test, df, c_cols)
unknown_category_rows_index <- attr(df_test_clean, "unknown_category_rows_index")
# Note: Do not pass df_test like data=df_test. This for some reason ends up predict returning training data prediction.
model$prediction_test <- predict(model, df_test_clean, se.fit = TRUE)
model$prediction_test$unknown_category_rows_index <- unknown_category_rows_index
}
# these attributes are used in tidy of randomForest TODO: is this good for lm too?
model$terms_mapping <- terms_mapping
model$orig_levels <- orig_levels
# For displaying if sampling happened or not.
model$sampled_nrow <- sampled_nrow
# add special lm_exploratory class for adding extra info at glance().
if (model_type == "glm") {
class(model) <- c("glm_exploratory", class(model))
if (with_marginal_effects) { # For now, we have tested marginal_effects for logistic regression only. It seems to fail for probit for example.
if (smote) {
model$marginal_effects <- calc_average_marginal_effects(model, data=df_before_smote, with_confint=with_marginal_effects_confint) # This has to be done after glm_exploratory class name is set.
}
else {
model$marginal_effects <- calc_average_marginal_effects(model, with_confint=with_marginal_effects_confint) # This has to be done after glm_exploratory class name is set.
}
}
}
else {
class(model) <- c("lm_exploratory", class(model))
}
# Calculate variable importances.
if (!is.null(model$imp_df) && "error" %nin% class(model$imp_df)) { # if importance is available, show only max_pd_vars most important vars.
imp_df <- model$imp_df
imp_vars <- as.character((imp_df %>% arrange(-importance))$variable)
imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # Keep only max_pd_vars most important variables
}
else if (importance_measure == "firm") {
imp_vars <- c_cols # For FIRM, we need to calculate partial dependence for all predictors first.
}
else {
# Show only max_pd_vars most significant (ones with the smallest P values) vars.
# For categorical, pick the smallest P value among all classes of it.
c_cols_list <- as.list(c_cols)
# Since broom:::tidy.lm raises :Error: No tidy method for objects of class lm_exploratory",
# always use broom:::tidy.glm which does not have this problem, and seems to return the same result,
# even for lm.
coef_df <- broom:::tidy.glm(model)
# str_detect matches with all categorical class terms that belongs to the variable.
p_values_list <- c_cols_list %>% purrr::map(function(x){(coef_df %>% filter(stringr::str_detect(term, paste0("^`?", x))) %>% summarize(p.value=min(p.value, na.rm=TRUE)))$p.value})
p_values_df <- tibble::tibble(term=c_cols, p.value=purrr::flatten_dbl(p_values_list))
imp_vars <- (p_values_df %>% dplyr::arrange(p.value))$term
imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # Keep only max_pd_vars most important variables
# We tried showing only significant variables, but decided oftentimes we wanted to see even insignificant ones. Keeping that code for now.
#
# signif_df <- broom::tidy(model) %>% filter(p.value < p_value_threshold) # One-liner to keep only significant predictors by matching names with result of tidy().
# if (nrow(signif_df) > 0) {
# imp_vars <- c_cols[sapply(c_cols,function(x){any(stringr::str_detect(signif_df$term, paste0("^`?", x)))})]
# }
# else {
# imp_vars <- c()
# }
}
if (length(imp_vars) > 0) {
model$imp_vars <- imp_vars
model$partial_dependence <- partial_dependence.lm_exploratory(model, target=clean_target_col, vars=imp_vars, data=df, n=c(pd_grid_resolution, min(nrow(df), pd_sample_size)))
}
else {
model$partial_dependence <- NULL
}
if (importance_measure == "firm") {
if (length(c_cols) > 1) { # Skip importance calculation if there is only one variable.
pdp_target_col <- attr(model$partial_dependence, "target")
imp_df <- importance_firm(model$partial_dependence, pdp_target_col, imp_vars)
model$imp_df <- imp_df
imp_vars <- imp_df$variable
model$imp_vars <- imp_vars
}
else {
error <- simpleError("Variable importance requires two or more variables.")
model$imp_df <- error
}
imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # take max_pd_vars most important variables
model$imp_vars <- imp_vars
# Shrink the partial dependence data keeping only the important variables.
model$partial_dependence <- shrink_partial_dependence_data(model$partial_dependence, imp_vars)
}
if (!is.null(target_funs)) {
model$orig_target_col <- target_col
model$target_funs <- target_funs
}
if (!is.null(predictor_funs)) {
model$orig_predictor_cols <- orig_selected_cols
attr(predictor_funs, "LC_TIME") <- Sys.getlocale("LC_TIME")
attr(predictor_funs, "sysname") <- Sys.info()[["sysname"]] # Save platform name (e.g. Windows) since locale name might need conversion for the platform this model will be run on.
attr(predictor_funs, "lubridate.week.start") <- getOption("lubridate.week.start")
model$predictor_funs <- predictor_funs
}
model$target_col <- clean_target_col # We use target column info to bring the column next to the predicted value in the prediction() output.
model$orig_predictor_classes <- orig_predictor_classes
list(model = model, test_index = test_index, source_data = source_data)
}, error = function(e){
if(length(grouped_cols) > 0) {
# In repeat-by case, we report group-specific error in the Summary table,
# so that analysis on other groups can go on.
if (model_type == "glm") {
class(e) <- c("glm_exploratory", class(e))
}
else {
class(e) <- c("lm_exploratory", class(e))
}
list(model = e, test_index = NULL, source_data = NULL)
} else {
stop(e)
}
})
}
model_and_data_col <- "model_and_data"
ret <- do_on_each_group(clean_df, each_func, name = model_and_data_col, with_unnest = FALSE)
if (length(grouped_cols) > 0) {
ret <- ret %>% tidyr::nest(-grouped_cols)
} else {
ret <- ret %>% tidyr::nest()
}
ret <- ret %>% dplyr::ungroup() # Remove rowwise grouping so that following mutate works as expected.
ret <- ret %>% dplyr::mutate(model = purrr::map(data, function(df){
df[[model_and_data_col]][[1]]$model
})) %>%
dplyr::mutate(.test_index = purrr::map(data, function(df){
df[[model_and_data_col]][[1]]$test_index
})) %>%
dplyr::mutate(source.data = purrr::map(data, function(df){
data <- df[[model_and_data_col]][[1]]$source_data
if (length(grouped_cols) > 0 && !is.null(data)) {
data %>% dplyr::select(-grouped_cols)
} else {
data
}
})) %>%
dplyr::select(-data)
# Rowwise grouping has to be redone with original grouped_cols, so that summarize(tidy(model)) later can add back the group column.
if (length(grouped_cols) > 0) {
ret <- ret %>% dplyr::rowwise(grouped_cols)
} else {
ret <- ret %>% dplyr::rowwise()
}
ret
}
#' special version of glance.lm function to use with build_lm.fast.
#' @export
glance.lm_exploratory <- function(x, pretty.name = FALSE, ...) { #TODO: add test
if ("error" %in% class(x)) {
ret <- data.frame(Note = x$message)
return(ret)
}
ret <- broom:::glance.lm(x)
# Add Max VIF if VIF is available.
if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
vif_df <- vif_to_dataframe(x)
if (nrow(vif_df) > 0 ) {
max_vif <- max(vif_df$VIF, na.rm=TRUE)
ret <- ret %>% dplyr::mutate(`Max VIF`=!!max_vif)
}
}
# Adjust the subtle difference between sigma (Residual Standard Error) and RMSE.
# In RMSE, division is done by observation size, while it is by residual degree of freedom in sigma.
# https://www.rdocumentation.org/packages/sjstats/versions/0.17.4/topics/cv
# https://stat.ethz.ch/pipermail/r-help/2012-April/308935.html
rmse_val <- sqrt(ret$sigma^2 * x$df.residual / nrow(x$model))
sample_size <- nrow(x$model)
ret <- ret %>% dplyr::mutate(rmse=!!rmse_val, n=!!sample_size)
# Drop sigma in favor of rmse.
ret <- ret %>% dplyr::select(r.squared, adj.r.squared, rmse, statistic, p.value, n, everything(), -sigma)
if(pretty.name) {
ret <- ret %>% dplyr::rename(`R Squared`=r.squared, `Adj R Squared`=adj.r.squared, `RMSE`=rmse, `F Value`=statistic, `P Value`=p.value, `DF`=df, `Log Likelihood`=logLik, `Residual Deviance`=deviance, `Residual DF`=df.residual, `Rows`=n)
# Place the Max VIF column after the Residual Deviance.
# ret <- ret %>% dplyr::relocate(`Residual DF`, .after=`Residual Deviance`)
# Note column might not exist. Rename if it is there.
colnames(ret)[colnames(ret) == "note"] <- "Note"
}
if (!is.null(ret$nobs)) { # glance.glm's newly added nobs seems to be same as Number of Rows. Suppressing it for now.
ret <- ret %>% dplyr::select(-nobs)
}
ret
}
#' special version of glance.lm function to use with build_lm.fast.
#' @export
glance.glm_exploratory <- function(x, pretty.name = FALSE, binary_classification_threshold = 0.5, ...) { #TODO: add test
if ("error" %in% class(x)) {
ret <- data.frame(Note = x$message)
return(ret)
}
ret <- broom:::glance.glm(x)
# calculate model p-value since glm does not provide it as is.
# https://stats.stackexchange.com/questions/129958/glm-in-r-which-pvalue-represents-the-goodness-of-fit-of-entire-model
f0 <- x$formula # copy formula as a basis for null model.
attr(f0,".Environment")<-environment() # Since we removed .Environment attribute for size reduction, add it back so that the following process works.
lazyeval::f_rhs(f0) <- 1 # create null model formula.
x0 <- glm(f0, x$model, family = x$family) # build null model. Use x$model rather than x$data since x$model seems to be the data after glm handled missingness.
pvalue <- with(anova(x0,x),pchisq(Deviance,Df,lower.tail=FALSE)[2])
if(pretty.name) {
ret <- ret %>% dplyr::mutate(`P Value`=!!pvalue, `Rows`=!!length(x$y))
}
else {
ret <- ret %>% dplyr::mutate(p.value=!!pvalue, n=!!length(x$y))
}
# For GLM (Negative Binomial)
if("negbin" %in% class(x)) {
if(pretty.name) {
ret <- ret %>% dplyr::mutate(`Theta`=!!(x$theta), `SE Theta`=!!(x$SE.theta))
}
else {
ret <- ret %>% dplyr::mutate(theta=!!(x$theta), SE.theta=!!(x$SE.theta))
}
}
if (x$family$family %in% c('gaussian')) { # only for gaussian, which is same as linear regression (if link function is identity).
root_mean_square_error <- rmse(x$fitted.values, x$y)
rsq <- r_squared(x$y, x$fitted.values)
n_observations <- nrow(x$model)
df_residual <- x$df.residual
adj_rsq <- adjusted_r_squared(rsq, n_observations, df_residual)
ret$rmse <- root_mean_square_error
ret$r.squared <- rsq
ret$adj.r.squared <- adj_rsq
}
if (x$family$family %in% c('binomial', 'quasibinomial')) { # only for logistic regression.
# Calculate F Score, Accuracy Rate, Misclassification Rate, Precision, Recall, Number of Rows
threshold_value <- if (is.numeric(binary_classification_threshold)) {
binary_classification_threshold
} else {
get_optimized_score(x$y, x$fitted.value, threshold = binary_classification_threshold)$threshold
}
if (is.null(threshold_value)) { # Could not get optimized value. To avoid error, it has to have some numeric value.
threshold_value <- 0.5
}
predicted <- ifelse(x$fitted.value > threshold_value, 1, 0) #TODO make threshold adjustable
ret2 <- evaluate_classification(x$y, predicted, 1, pretty.name = pretty.name)
ret2 <- ret2[, 2:6]
ret <- ret %>% bind_cols(ret2)
# calculate AUC
ret$auc <- auroc(x$fitted.value, x$y)
# Show number of rows for positive case and negative case, especially so that result of SMOTE is visible.
ret$positives <- sum(x$y == 1, na.rm = TRUE)
ret$negatives <- sum(x$y != 1, na.rm = TRUE)
}
# Add Max VIF if VIF is available.
if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
vif_df <- vif_to_dataframe(x)
if (nrow(vif_df) > 0 ) {
max_vif <- max(vif_df$VIF, na.rm=TRUE)
ret <- ret %>% dplyr::mutate(`Max VIF`=!!max_vif)
}
}
if(pretty.name) {
if (x$family$family %in% c('binomial', 'quasibinomial')) { # for binomial regressions.
colnames(ret)[colnames(ret) == "null.deviance"] <- "Null Deviance"
colnames(ret)[colnames(ret) == "df.null"] <- "Null Model DF"
colnames(ret)[colnames(ret) == "logLik"] <- "Log Likelihood"
colnames(ret)[colnames(ret) == "deviance"] <- "Residual Deviance"
colnames(ret)[colnames(ret) == "df.residual"] <- "Residual DF"
colnames(ret)[colnames(ret) == "auc"] <- "AUC"
ret <- ret %>% dplyr::select(AUC, `F1 Score`, `Accuracy Rate`, `Misclass. Rate`, `Precision`, `Recall`, `P Value`, `Rows`, positives, negatives, `Log Likelihood`, `AIC`, `BIC`, `Residual Deviance`, `Residual DF`, `Null Deviance`, `Null Model DF`, everything())
if (!is.null(x$orig_levels)) {
pos_label <- x$orig_levels[2]
neg_label <- x$orig_levels[1]
}
else {
# This should be only numeric case.
# In case of 0 and 1, this is making sense.
# But it seems the input can be numbers between 0 and 1 like 0.5 too.
# TODO: Look into how to handle such case.
pos_label <- "TRUE"
neg_label <- "FALSE"
}
colnames(ret)[colnames(ret) == "positives"] <- paste0("Rows for ", pos_label)
colnames(ret)[colnames(ret) == "negatives"] <- paste0("Rows for ", neg_label)
}
else { # for other numeric regressions.
colnames(ret)[colnames(ret) == "null.deviance"] <- "Null Deviance"
colnames(ret)[colnames(ret) == "df.null"] <- "Null Model DF"
colnames(ret)[colnames(ret) == "logLik"] <- "Log Likelihood"
colnames(ret)[colnames(ret) == "deviance"] <- "Residual Deviance"
colnames(ret)[colnames(ret) == "df.residual"] <- "Residual DF"
colnames(ret)[colnames(ret) == "rmse"] <- "RMSE"
colnames(ret)[colnames(ret) == "r.squared"] <- "R Squared"
colnames(ret)[colnames(ret) == "adj.r.squared"] <- "Adj R Squared"
ret <- ret %>% dplyr::select(matches("^R Squared$"), matches("^Adj R Squared$"), matches("^RMSE$"), `P Value`, `Rows`, `Log Likelihood`, `AIC`, `BIC`, `Residual Deviance`, `Residual DF`, `Null Deviance`, `Null Model DF`, everything())
}
}
if (!is.null(ret$nobs)) { # glance.glm's newly added nobs seems to be same as Number of Rows. Suppressing it for now.
ret <- ret %>% dplyr::select(-nobs)
}
ret
}
# Creates a data frame that maps term name to its base level.
xlevels_to_base_level_table <- function(xlevels) {
term <- purrr::flatten_chr(purrr::map(names(xlevels), function(vname) {
# Quote variable name with backtick if it includes special characters or space.
# Special characters to detect besides space. Note that period and underscore should *not* be included here. : ~!@#$%^&*()+={}|:;'<>,/?"[]-\
# Using grepl() as opposed to str_detect() because str_detect seems to return wrong decision when vname ends with SJIS damemoji.
# perl=TRUE is required here, since it seems this regex does not detect space, tilde, etc. without perl=TRUE for some reason.
paste0(if_else(grepl("[ ~!@#$%^&*()+={}|:;'<>,/?\"\\[\\]\\-\\\\]", vname, perl=TRUE), paste0('`',vname,'`'),vname),xlevels[[vname]])
}))
base_level <- purrr::flatten_chr(purrr::map(xlevels, function(v){rep(v[[1]],length(v))}))
ret <- data.frame(term=term, base.level=base_level)
ret
}
# Takes lm/glm model with vif (variance inflation factor) and returns data frame with extracted info.
vif_to_dataframe <- function(x) {
ret <- NULL
if (is.matrix(x$vif)) {
# It is a matrix when there are categorical variables.
ret <- x$vif %>% as.data.frame() %>% tibble::rownames_to_column(var="term") %>% rename(VIF=GVIF)
}
else {
# It is a vector when there is no categorical variable.
ret <- data.frame(term=names(x$vif), VIF=x$vif)
}
# Eliminate negative VIF values that sometimes happen, most likely because of numeric error.
ret <- ret %>% dplyr::mutate(VIF = pmax(VIF, 0))
# Map variable names back to the original.
# as.character is to be safe by converting from factor. With factor, reverse mapping result will be messed up.
ret$term <- x$terms_mapping[as.character(ret$term)]
ret
}
# From name of variable, returns possible names of terms returned from lm/glm.
var_to_possible_terms_lm <- function(var, x) {
if (is.factor(x$model[[var]])) {
# Possibly, the variable name in the term name is quoted with backtic.
c(paste0(var, levels(x$model[[var]])),
paste0('`', var, '`', levels(x$model[[var]])))
}
else {
# Possibly, the term name is quoted with backtic.
c(var, paste0('`', var, '`'))
}
}
# From name of variable, returns possible names of terms returned from coxph.
var_to_possible_terms_coxph <- function(var, x) {
if (!is.null(x$xlevels[[var]])) {
# Possibly, the variable name in the term name is quoted with backtic.
paste0(var, x$xlevels[[var]])
}
else {
# Possibly, the term name is quoted with backtic.
var
}
}
# Returns P-value for the variable. For categorical, the smallest value is returned.
# For the color of relative importance bar chart.
get_var_min_pvalue <- function(var, coef_df, x) {
if ("coxph" %in% class(x)) {
terms <- var_to_possible_terms_coxph(as.character(var), x)
}
else { # We assume it is lm or glm here.
terms <- var_to_possible_terms_lm(as.character(var), x)
}
# na.rm is necessary to handle the case where part of categorical variables are dropped due to perfect collinearity.
min(coef_df$p.value[coef_df$term %in% terms], na.rm=TRUE)
}
#' special version of tidy.lm function to use with build_lm.fast.
#' @export
tidy.lm_exploratory <- function(x, type = "coefficients", pretty.name = FALSE, ...) { #TODO: add test
if ("error" %in% class(x)) {
ret <- data.frame()
return(ret)
}
switch(type,
coefficients = {
# Since broom:::tidy.lm raises :Error: No tidy method for objects of class lm_exploratory",
# always use broom:::tidy.glm which does not have this problem, and seems to return the same result,
# even for lm.
ret <- broom:::tidy.glm(x)
ret <- ret %>% mutate(conf.low=estimate-1.96*std.error, conf.high=estimate+1.96*std.error)
base_level_table <- xlevels_to_base_level_table(x$xlevels)
# Convert term from factor to character to remove warning at left_join.
ret <- ret %>% dplyr::mutate(term=as.character(term)) %>% dplyr::left_join(base_level_table, by="term")
if (any(is.na(x$coefficients))) {
# since broom skips coefficients with NA value, which means removed by lm because of multi-collinearity,
# put it back to show them.
# reference: https://stats.stackexchange.com/questions/25804/why-would-r-return-na-as-a-lm-coefficient
removed_coef_df <- data.frame(term=names(x$coefficients[is.na(x$coefficients)]), note="Dropped most likely due to perfect multicollinearity.")
ret <- ret %>% dplyr::bind_rows(removed_coef_df)
if (pretty.name) {
ret <- ret %>% rename(Note=note)
}
}
# Map coefficient names back to original.
ret$term <- map_terms_to_orig(ret$term, x$terms_mapping)
if (pretty.name) {
ret <- ret %>% rename(Term=term, Coefficient=estimate, `Std Error`=std.error,
`t Value`=statistic, `P Value`=p.value,
`Conf Low`=conf.low,
`Conf High`=conf.high,
`Base Level`=base.level)
}
ret
},
vif = {
if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
ret <- vif_to_dataframe(x)
}
else {
ret <- data.frame() # Skip output for this group. TODO: Report error in some way.
}
ret
},
partial_dependence = {
handle_partial_dependence(x)
},
importance = {
if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
# Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
# Return empty data.frame to avoid error.
ret <- data.frame()
return(ret)
}
ret <- x$imp_df
# Add p.value column.
# Since broom:::tidy.lm raises :Error: No tidy method for objects of class lm_exploratory",
# always use broom:::tidy.glm which does not have this problem, and seems to return the same result,
# even for lm.
coef_df <- broom:::tidy.glm(x)
ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
get_var_min_pvalue(var, coef_df, x)
})) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
# Map variable names back to the original.
# as.character is to be safe by converting from factor. With factor, reverse mapping result will be messed up.
ret$variable <- x$terms_mapping[as.character(ret$variable)]
ret
},
# This is copy of the code for "importance" with adjustment for output column name just for backward compatibility for pre-6.5.
# Remove at appropriate timing.
permutation_importance = {
if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
# Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
# Return empty data.frame to avoid error.
ret <- data.frame()
return(ret)
}
ret <- x$imp_df
# Add p.value column.
# Since broom:::tidy.lm raises :Error: No tidy method for objects of class lm_exploratory",
# always use broom:::tidy.glm which does not have this problem, and seems to return the same result,
# even for lm.
coef_df <- broom:::tidy.glm(x)
ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
get_var_min_pvalue(var, coef_df, x)
})) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
# Map variable names back to the original.
# as.character is to be safe by converting from factor. With factor, reverse mapping result will be messed up.
ret$variable <- x$terms_mapping[as.character(ret$variable)]
ret <- ret %>% dplyr::rename(term=variable)
ret
}
)
}
#' Special version of tidy.glm function to use with build_lm.fast.
#' In case of error, returns empty data frame, or data frame with Note column.
#' @export
tidy.glm_exploratory <- function(x, type = "coefficients", pretty.name = FALSE, variable_metric = NULL, converged_only = FALSE, ...) { #TODO: add test
if ("error" %in% class(x)) {
ret <- data.frame()
return(ret)
}
# Skip if model did not converge. We are using this to skip coefficient scatter plot with very large coefficients,
# or if odds ratio is used, Inf or 0 odds ratios.
if (converged_only && !x$converged) {
ret <- data.frame()
return(ret)
}
switch(type,
coefficients = {
ret <- broom:::tidy.glm(x)
ret <- ret %>% mutate(conf.low=estimate-1.96*std.error, conf.high=estimate+1.96*std.error)
if (x$family$family == "binomial" && x$family$link == "logit") { # odds ratio is only for logistic regression
ret <- ret %>% mutate(odds_ratio=exp(estimate))
if (!is.null(variable_metric) && variable_metric == "odds_ratio") { # For Analytics View, overwrite conf.low/conf.high with those of odds ratio.
ret <- ret %>% mutate(conf.low=exp(conf.low), conf.high=exp(conf.high))
ret <- ret %>% select(term, odds_ratio, conf.low, conf.high, everything()) # Bring odds ratio upfront together with its confidence interval.
}
else {
ret <- ret %>% select(term, estimate, conf.low, conf.high, everything()) # Bring coefficient upfront together with its confidence interval.
}
}
if (!is.null(x$marginal_effects)) {
# Convert term from factor to character to remove warning at left_join.
ret <- ret %>% dplyr::mutate(term=as.character(term)) %>% dplyr::left_join(x$marginal_effects, by="term")
# Adjust order of columns
if (!is.null(ret$ame)) {
ret <- ret %>% relocate(ame, .after=term)
if (!is.null(ret$ame_low)) {
ret <- ret %>% relocate(ame_high, ame_low, .after=ame)
}
}
}
base_level_table <- xlevels_to_base_level_table(x$xlevels)
# Convert term from factor to character to remove warning at left_join.
ret <- ret %>% dplyr::mutate(term=as.character(term)) %>% dplyr::left_join(base_level_table, by="term")
if (any(is.na(x$coefficients))) {
# since broom skips coefficients with NA value, which means removed by lm because of multi-collinearity,
# put it back to show them.
# reference: https://stats.stackexchange.com/questions/25804/why-would-r-return-na-as-a-lm-coefficient
removed_coef_df <- data.frame(term=names(x$coefficients[is.na(x$coefficients)]), note="Dropped most likely due to perfect multicollinearity.")
ret <- ret %>% dplyr::bind_rows(removed_coef_df)
if (pretty.name) {
ret <- ret %>% rename(Note=note)
}
}
# Map coefficient names back to original.
ret$term <- map_terms_to_orig(ret$term, x$terms_mapping)
if (pretty.name) {
# Rename to pretty names
ret <- ret %>% rename(Term=term, Coefficient=estimate, `Std Error`=std.error,
`t Value`=statistic, `P Value`=p.value, `Conf Low`=conf.low, `Conf High`=conf.high,
`Base Level`=base.level)
if (!is.null(ret$ame)) {
ret <- ret %>% rename(`Average Marginal Effect`=ame)
}
if (!is.null(ret$ame_low)) {
ret <- ret %>% rename(`AME Low`=ame_low,`AME High`=ame_high)
}
if (x$family$family == "binomial" && x$family$link == "logit") { # odds ratio is only for logistic regression
ret <- ret %>% rename(`Odds Ratio`=odds_ratio)
}
}
ret
},
conf_mat = {
target_col <- as.character(lazyeval::f_lhs(x$formula)) # get target column name
actual_val = x$model[[target_col]]
predicted = x$fitted.value > 0.5 # TODO: make threshold adjustable. Note: This part of code seems to be unused. Check and remove.
# convert predicted to original set of values. should be either logical, numeric, or factor.
predicted <- if (is.logical(actual_val)) {
predicted
} else if (is.numeric(actual_val)) {
as.numeric(predicted)
} else if (is.factor(actual_val)){
# create a factor vector with the same levels as actual_val
# predicted is logical, so should +1 to make it index
factor(levels(actual_val)[as.numeric(predicted) + 1], levels(actual_val))
}
ret <- data.frame(
actual_value = actual_val,
predicted_value = predicted
) %>%
dplyr::filter(!is.na(predicted_value))
# get count if it's classification
ret <- ret %>%
dplyr::group_by(actual_value, predicted_value) %>%
dplyr::summarize(count = n()) %>%
dplyr::ungroup()
ret
},
vif = {
if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
ret <- vif_to_dataframe(x)
}
else {
ret <- data.frame() # Skip output for this group. TODO: Report error in some way.
}
ret
},
partial_dependence = {
handle_partial_dependence(x)
},
importance = {
if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
# Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
# Return empty data.frame to avoid error.
ret <- data.frame()
return(ret)
}
ret <- x$imp_df
# Add p.value column.
coef_df <- broom:::tidy.glm(x)
ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
get_var_min_pvalue(var, coef_df, x)
})) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
# Map variable names back to the original.
# as.character is to be safe by converting from factor. With factor, reverse mapping result will be messed up.
ret$variable <- x$terms_mapping[as.character(ret$variable)]
ret
},
# This is copy of the code for "importance" with adjustment for output column name just for backward compatibility for pre-6.5.
# Remove at appropriate timing.
permutation_importance = {
if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
# Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
# Return empty data.frame to avoid error.
ret <- data.frame()
return(ret)
}
ret <- x$imp_df
# Add p.value column.
coef_df <- broom:::tidy.glm(x)
ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
get_var_min_pvalue(var, coef_df, x)
})) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
# Map variable names back to the original.
# as.character is to be safe by converting from factor. With factor, reverse mapping result will be messed up.
ret$variable <- x$terms_mapping[as.character(ret$variable)]
ret <- ret %>% dplyr::rename(term=variable)
ret
}
)
}
#' wrapper for tidy type partial dependence
#' @export
lm_partial_dependence <- function(df, ...) { # TODO: write test for this.
res <- df %>% tidy_rowwise(model, type="partial_dependence", ...)
if (nrow(res) == 0) {
return(data.frame()) # Skip the rest of processing by returning empty data.frame.
}
grouped_col <- grouped_by(res) # When called from analytics view, this should be a single column or empty.
# grouped_by has to be on res rather than on df since dplyr::group_vars
# does not work on rowwise-grouped data frame.
if (length(grouped_col) > 0) {
res <- res %>% dplyr::ungroup() # ungroup to mutate group_by column.
# Folloing is not necessary since we separately display partial dependence plot for each group since v5.5.
# add variable name to the group_by column, so that chart is repeated by the combination of group_by column and variable name.
# res[[grouped_col]] <- paste(as.character(res[[grouped_col]]), res$x_name)
res[[grouped_col]] <- forcats::fct_inorder(factor(res[[grouped_col]])) # set order to appear as facets
res <- res %>% dplyr::group_by(!!!rlang::syms(grouped_col)) # put back group_by for consistency
}
else {
res$x_name <- forcats::fct_inorder(factor(res$x_name)) # set order to appear as facets
}
# gather we did after edarf::partial_dependence call turned x_value into factor if not all variables were in a same data type like numeric.
# to keep the numeric or factor order (e.g. Sun, Mon, Tue) of x_value in the resulting chart, we do fct_inorder here while x_value is in order.
# the first factor() is for the case x_value is not already a factor, to avoid error from fct_inorder()
res <- res %>% dplyr::mutate(x_value = forcats::fct_inorder(factor(x_value))) # TODO: if same number appears for different variables, order will be broken.
res
}
#' @export
augment.lm_exploratory <- function(x, data = NULL, newdata = NULL, data_type = "training", ...) {
if ("error" %in% class(x)) {
ret <- data.frame()
return(ret)
}
if(!is.null(newdata)) {
# Replay the mutations on predictors.
if(!is.null(x$predictor_funs)) {
newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
}
predictor_variables <- all.vars(x$terms)[-1]
predictor_variables_orig <- x$terms_mapping[predictor_variables]
# Rename columns via predictor_variables_orig, which is a named vector.
# TODO: What if names of the other columns conflicts with our temporary name, c1_, c2_...?
cleaned_data <- newdata %>% dplyr::rename(predictor_variables_orig)
# Align factor levels including Others and (Missing) to the model. TODO: factor level order can be different from the model training data. Is this ok?
cleaned_data <- align_predictor_factor_levels(cleaned_data, x$xlevels, predictor_variables)
na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
if (length(na_row_numbers) > 0) {
cleaned_data <- cleaned_data[-na_row_numbers,]
}
ret <- broom:::augment.lm(x, data = NULL, newdata = cleaned_data, se = TRUE, ...)
# TODO: Restore removed rows.
} else if (!is.null(data)) {
data <- data %>% dplyr::relocate(!!rlang::sym(x$target_col), .after = last_col()) # Bring the target column to the last so that it is next to the predicted value in the output.
ret <- switch(data_type,
training = { # Call broom:::augment.lm as is
broom:::augment.lm(x, data = data, newdata = newdata, se = TRUE, ...)
},
test = {
# Augment data with already predicted result in the model.
data$.fitted <- restore_na(x$prediction_test$fit, x$prediction_test$unknown_category_rows_index)
data$.se.fit <- restore_na(x$prediction_test$se.fit, x$prediction_test$unknown_category_rows_index)
data
})
}
else {
ret <- broom:::augment.lm(x, se = TRUE, ...)
}
# Rename columns back to the original names.
names(ret) <- coalesce(x$terms_mapping[names(ret)], names(ret))
ret
}
#' @export
augment.glm_exploratory <- function(x, data = NULL, newdata = NULL, data_type = "training", binary_classification_threshold = 0.5, ...) {
if ("error" %in% class(x)) {
ret <- data.frame()
return(ret)
}
if(!is.null(newdata)) {
# Replay the mutations on predictors.
if(!is.null(x$predictor_funs)) {
newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
}
predictor_variables <- all.vars(x$terms)[-1]
predictor_variables_orig <- x$terms_mapping[predictor_variables]
# Rename columns via predictor_variables_orig, which is a named vector.
# TODO: What if names of the other columns conflicts with our temporary name, c1_, c2_...?
cleaned_data <- newdata %>% dplyr::rename(predictor_variables_orig)
# Align factor levels including Others and (Missing) to the model. TODO: factor level order can be different from the model training data. Is this ok?
cleaned_data <- align_predictor_factor_levels(cleaned_data, x$xlevels, predictor_variables)
na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
if (length(na_row_numbers) > 0) {
cleaned_data <- cleaned_data[-na_row_numbers,]
}
ret <- tryCatch({
broom:::augment.glm(x, data = NULL, newdata = cleaned_data, se = TRUE, ...)
}, error = function(e){
# se=TRUE throws error that looks like "singular matrix 'a' in solve",
# in some subset of cases of perfect collinearity.
# Try recovering from it by running with se=FALSE.
broom:::augment.glm(x, data = NULL, newdata = cleaned_data, se = FALSE, ...)
})
} else if (!is.null(data)) {
# Bring the target column to the last so that it is next to the predicted value in the output.
# Note that source.data of lm/glm model df has mapped column names, unlike that of ranger.
data <- data %>% dplyr::relocate(!!rlang::sym(x$target_col), .after = last_col())
ret <- switch(data_type,
training = {
tryCatch({
broom:::augment.glm(x, data = data, newdata = newdata, se = TRUE, ...)
}, error = function(e){
broom:::augment.glm(x, data = data, newdata = newdata, se = FALSE, ...)
})
},
test = {
# Augment data with already predicted result in the model.
data$.fitted <- restore_na(x$prediction_test$fit, x$prediction_test$unknown_category_rows_index)
data$.se.fit <- restore_na(x$prediction_test$se.fit, x$prediction_test$unknown_category_rows_index)
data
})
}
else {
ret <- tryCatch({
broom:::augment.glm(x, se = TRUE, ...)
}, error = function(e){
broom:::augment.glm(x, se = FALSE, ...)
})
}
ret <- add_response(ret, x, "predicted_response") # Add response.
if (!is.null(ret$.fitted)) {
# Bring predicted_response column as the first of the prediction result related additional columns, so that it comes next to the actual values.
ret <- ret %>% dplyr::relocate(any_of(c("predicted_response")), .before=.fitted)
}
if (x$family$family == "binomial") { # Add predicted label in case of binomial (including logistic regression).
ret$predicted_label <- ret$predicted_response > binary_classification_threshold
}
# Rename columns back to the original names.
names(ret) <- coalesce(x$terms_mapping[names(ret)], names(ret))
ret
}
# For some reason, find_data called from inside margins::marginal_effects() fails in Exploratory.
# Explicitly declaring find_data for our glm_exploratory class works it around.
#' @export
find_data.glm_exploratory <- function(model, env = parent.frame(), ...) {
model$data
}
# Generates Summary table for Analytics View. It can handle Test Mode.
# This is written for linear regression analytics view and GLM analytics views that makes numeric prediction.
evaluate_lm_training_and_test <- function(df, pretty.name = FALSE){
# Get the summary row for traninng data. Info is retrieved from model by glance()
training_ret <- df %>% glance_rowwise(model, pretty.name = pretty.name)
ret <- training_ret
grouped_col <- colnames(df)[!colnames(df) %in% c("model", ".test_index", "source.data")]
# Consider it test mode if any of the element of .test_index column has non-zero length, and generate summary row for test data.
# Unlike training data, this involves calculating metrics by ourselves from test prediction result.
if (purrr::some(df$.test_index, function(x){length(x)!=0})) {
ret$is_test_data <- FALSE # Set is_test_data FALSE for training data. Add is_test_data column only when there are test data too.
each_func <- function(df){
# With the way this is called, df becomes list rather than data.frame.
# Make it data.frame again so that prediction() can be applied on it.
if (!is.data.frame(df)) {
df <- tibble::tribble(~model, ~.test_index, ~source.data,
df$model, df$.test_index, df$source.data)
}
tryCatch({
test_pred_ret <- prediction(df, data = "test")
## get Model Object
m <- (df %>% filter(!is.null(model)))$model[[1]]
actual_val_col <- all.vars(df$model[[1]]$terms)[[1]]
# Get original target column name.
actual_val_col_orig <- df$model[[1]]$terms_mapping[[actual_val_col]]
actual <- test_pred_ret[[actual_val_col_orig]]
predicted <- test_pred_ret$predicted_value
root_mean_square_error <- rmse(actual, predicted)
test_n <- sum(!is.na(predicted)) # Sample size for test.
# To calculate R Squared for test data, use same null model basis as training,
# so that the results are comparable.
null_model_mean <- mean(df$model[[1]]$model[[actual_val_col]], na.rm=TRUE)
rsq <- r_squared(actual, predicted, null_model_mean)
# Calculate Adjusted R Sauared
# https://en.wikipedia.org/wiki/Coefficient_of_determination
n_observations <- nrow(df$model[[1]]$model)
df_residual <- df$model[[1]]$df.residual
adj_rsq <- adjusted_r_squared(rsq, n_observations, df_residual)
test_ret <- data.frame(
r.squared = rsq,
adj.r.squared = adj_rsq,
rmse = root_mean_square_error,
n = test_n
)
if(pretty.name) {
test_ret <- test_ret %>% dplyr::rename(`R Squared`=r.squared, `Adj R Squared`=adj.r.squared, `RMSE`=rmse, `Rows`=n)
}
test_ret$is_test_data <- TRUE
test_ret
}, error = function(e){
data.frame()
})
}
# df is already grouped rowwise, but to get group column value on the output, we need to group it explicitly with the group column.
if (length(grouped_col) > 0) {
df <- df %>% dplyr::group_by(!!!rlang::syms(grouped_col))
}
test_ret <- do_on_each_group(df, each_func, with_unnest = TRUE)
ret <- ret %>% dplyr::bind_rows(test_ret)
}
# Reorder columns. Bring group_by column first, and then is_test_data column, if it exists.
if (!is.null(ret$is_test_data)) {
if (length(grouped_col) > 0) {
ret <- ret %>% dplyr::select(!!!rlang::syms(grouped_col), is_test_data, everything())
}
else {
ret <- ret %>% dplyr::select(is_test_data, everything())
}
}
else {
if (length(grouped_col) > 0) {
ret <- ret %>% dplyr::select(!!!rlang::syms(grouped_col), everything())
}
}
if (length(grouped_col) > 0){
ret <- ret %>% dplyr::arrange(!!!rlang::syms(grouped_col))
}
# Prettify is_test_data column. Do this after the above select calls, since it looks at is_test_data column.
if (!is.null(ret$is_test_data) && pretty.name) {
ret <- ret %>% dplyr::select(is_test_data, everything()) %>%
dplyr::mutate(is_test_data = dplyr::if_else(is_test_data, "Test", "Training")) %>%
dplyr::rename(`Data Type` = is_test_data)
}
# Bring Note column at the end.
if (!is.null(ret$Note)) {
ret <- ret %>% dplyr::select(-Note, everything(), Note)
}
ret
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.