R/build_lm.R

# Calculates permutation importance for binomial (including logistic) regression.
calc_permutation_importance_binomial <- function(fit, target, vars, data) {
  var_list <- as.list(vars)
  importances <- purrr::map(var_list, function(var) {
    mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
                                predict.fun = function(object,newdata){predict(object,type = "response",newdata=newdata)},
                                # Use minus log likelihood as the loss function, since it is what logistic regression optimises.
                                loss.fun = function(x,y){-sum(log(1- abs(x - y)),na.rm = TRUE)})
  })
  importances <- purrr::flatten_dbl(importances)
  importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
  importances_df
}
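
# A quick sanity check of the loss above (illustrative only): for y in {0, 1}
# and a predicted probability p, 1 - abs(p - y) is the probability the model
# assigned to the observed outcome, so the loss equals the usual binomial
# negative log-likelihood.
#   p <- c(0.9, 0.2); y <- c(1, 0)
#   -sum(log(1 - abs(p - y)))                  # 0.3285041
#   -sum(y * log(p) + (1 - y) * log(1 - p))    # 0.3285041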

# Calculates permutation importance for linear regression.
calc_permutation_importance_linear <- function(fit, target, vars, data) {
  var_list <- as.list(vars)
  importances <- purrr::map(var_list, function(var) {
    mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
                                # predict.fun can be the default function, which is predict(object, newdata=newdata).
                                # For some reason, the default loss.fun, which is mean((x - y)^2), returns NA even with na.rm=TRUE. Rewrote it with sum() to avoid the issue.
                                loss.fun = function(x,y){sum((x - y)^2, na.rm = TRUE)/length(x)})
  })
  importances <- purrr::flatten_dbl(importances)
  importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
  importances_df
}

# Calculates permutation importance for GLM Gaussian regression.
calc_permutation_importance_gaussian <- function(fit, target, vars, data) {
  var_list <- as.list(vars)
  importances <- purrr::map(var_list, function(var) {
    mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
                                # type needs to be "response" so that x in loss.fun can be used to calculate sum of squares with any choice of link function.
                                predict.fun = function(object,newdata){predict(object,type = "response",newdata=newdata)},
                                # For some reason, the default loss.fun, which is mean((x - y)^2), returns NA even with na.rm=TRUE. Rewrote it with sum() to avoid the issue.
                                loss.fun = function(x,y){sum((x - y)^2, na.rm = TRUE)/length(x)})
  })
  importances <- purrr::flatten_dbl(importances)
  importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
  importances_df
}

# Calculates permutation importance for Poisson regression.
calc_permutation_importance_poisson <- function(fit, target, vars, data) {
  var_list <- as.list(vars)
  importances <- purrr::map(var_list, function(var) {
    mmpf::permutationImportance(data, var, target, fit, nperm = 1, # By default, it creates 100 permuted data sets. We do just 1 for performance.
                                # type needs to be "link" since x in loss.fun expects linear predictor.
                                predict.fun = function(object,newdata){predict(object,type = "link",newdata=newdata)},
                                # Use minus log likelihood as the loss function.
                                # Reference: https://en.wikipedia.org/wiki/Poisson_regression#Maximum_likelihood-based_parameter_estimation
                                #            https://stats.stackexchange.com/questions/70054/how-is-it-possible-that-poisson-glm-accepts-non-integer-numbers
                                loss.fun = function(x,y){-sum(y*x-exp(x), na.rm = TRUE)})
  })
  importances <- purrr::flatten_dbl(importances)
  importances_df <- tibble::tibble(variable=vars, importance=pmax(importances, 0))
  importances_df
}
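
# Note on the Poisson loss above (illustrative): with the log link, the
# log-likelihood contribution of one observation is y*x - exp(x) - log(y!),
# where x is the linear predictor. log(y!) does not depend on the model, so
# dropping it leaves exactly the loss.fun used above (negated and summed).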

# TODO: Make this function model-agnostic and consolidate. There is similar code for lm/glm, ranger, rpart, and xgboost.
# Builds a partial dependence object for lm/glm with the same structure (a data.frame with attributes) as edarf::partial_dependence.
partial_dependence.lm_exploratory <- function(fit, target, vars = colnames(data),
  n = c(min(nrow(unique(data[, vars, drop = FALSE])), 25L), nrow(data)), # Keeping same default of 25 as edarf::partial_dependence, although we usually overwrite from callers.
  interaction = FALSE, uniform = TRUE, data, ...) {

  predict.fun <- function(object, newdata) {
    predict(object, newdata = newdata, type = "response")
  }

  aggregate.fun <- function(x) {
    c("preds" = mean(x))
  }

  # Generate grid points based on quantiles, so that FIRM calculated from them makes sense even when there are outliers.
  points <- list()
  quantile_points <- list()
  for (cname in vars) {
    if (is.numeric(data[[cname]])) {
      coldata <- data[[cname]]
      minv <- min(coldata, na.rm=TRUE)
      maxv <- max(coldata, na.rm=TRUE)
      grid <- minv + (0:20)/20 * (maxv - minv)
      quantile_grid <- quantile(coldata, probs=1:24/25)
      quantile_points[[cname]] <- quantile_grid
      points[[cname]] <- sort(unique(c(grid, quantile_grid)))
    }
    else {
      points[[cname]] <- unique(data[[cname]])
    }
  }

  args = list(
    "data" = data,
    "vars" = vars,
    "n" = n,
    "model" = fit,
    "points" = points,
    "predict.fun" = predict.fun,
    "aggregate.fun" = aggregate.fun,
    ...
  )

  if (length(vars) > 1L & !interaction) { # There is more than one variable. Iterate calls to mmpf::marginalPrediction.
    pd = rbindlist(sapply(vars, function(x) {
      args$vars = x
      if ("points" %in% names(args))
        args$points = args$points[x]
      mp = do.call(mmpf::marginalPrediction, args)
      names(mp)[ncol(mp)] = target
      mp
    }, simplify = FALSE), fill = TRUE)
    data.table::setcolorder(pd, c(vars, colnames(pd)[!colnames(pd) %in% vars]))
  } else {
    # Remove the value labels (names) from vars to avoid unexpected column names in the result.
    args$vars = as.character(vars)
    pd = do.call(mmpf::marginalPrediction, args)
    names(pd)[ncol(pd)] = target
  }

  attr(pd, "class") = c("pd", "data.frame")
  attr(pd, "interaction") = interaction == TRUE
  attr(pd, "target") = target
  attr(pd, "vars") = vars
  attr(pd, "points") = points
  attr(pd, "quantile_points") = quantile_points
  pd
}
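
# Illustrative note on the grid built above: with an outlier, the evenly
# spaced part of the grid gets stretched toward the outlier, while the
# quantile part stays dense where the data actually is. For example:
#   coldata <- c(1:99, 1000)
#   min(coldata) + (0:20)/20 * (max(coldata) - min(coldata)) # mostly > 100
#   quantile(coldata, probs = 1:24/25)                       # all <= 97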


# Calculate average marginal effects from model with margins package.
calc_average_marginal_effects <- function(model, data=NULL, with_confint=FALSE) {
  if (with_confint) {
    if (!is.null(data)) {
      m <- margins::margins(model, data=data)
    }
    else {
      m <- margins::margins(model)
    }
    ret <- as.data.frame(summary(m))
    ret <- ret %>% dplyr::rename(term=factor, ame=AME, ame_low=lower, ame_high=upper) %>%
      dplyr::select(term, ame, ame_low, ame_high) #TODO: look into SE, z, p too.
    ret
  }
  else {
    # Fast version that only calls margins::marginal_effects().
    # margins::margins() does a lot more than margins::marginal_effects(),
    # and takes about 10 times more time.
    if (!is.null(data)) {
      me <- margins::marginal_effects(model, data=data)
    }
    else {
      me <- margins::marginal_effects(model)
    }
    # For some reason, str_replace garbles column names generated from Date columns with Japanese names. Using gsub instead to avoid the issue.
    # term <- stringr::str_replace(names(me), "^dydx_", "")
    term <- gsub("^dydx_", "", names(me))
    ame <- purrr::flatten_dbl(purrr::map(me, function(x){mean(x, na.rm=TRUE)}))
    ret <- data.frame(term=term, ame=ame)
    ret
  }
}
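
# Usage sketch (illustrative; mtcars stands in for real data):
#   fit <- glm(am ~ hp + wt, data = mtcars, family = binomial)
#   calc_average_marginal_effects(fit)                       # term and ame columns
#   calc_average_marginal_effects(fit, with_confint = TRUE)  # adds ame_low/ame_high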

# VIF calculation definition from car::vif.
# Copied to avoid having to import many dependency packages.
vif <- function(mod, ...) {
    if (any(is.na(coef(mod)))) 
        stop ("there are aliased coefficients in the model")
    v <- vcov(mod)
    assign <- attr(model.matrix(mod), "assign")
    if (names(coefficients(mod)[1]) == "(Intercept)") {
        v <- v[-1, -1]
        assign <- assign[-1]
    }
    else warning("No intercept: vifs may not be sensible.")
    terms <- labels(terms(mod))
    n.terms <- length(terms)
    if (n.terms < 2) stop("model contains fewer than 2 terms")
    R <- cov2cor(v)
    detR <- det(R)
    result <- matrix(0, n.terms, 3)
    rownames(result) <- terms
    colnames(result) <- c("GVIF", "Df", "GVIF^(1/(2*Df))")
    for (term in 1:n.terms) {
        subs <- which(assign == term)
        result[term, 1] <- det(as.matrix(R[subs, subs])) *
            det(as.matrix(R[-subs, -subs])) / detR
        result[term, 2] <- length(subs)
    }
    if (all(result[, 2] == 1)) result <- result[, 1]
    else result[, 3] <- result[, 1]^(1/(2 * result[, 2]))
    result
}
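
# Usage sketch (illustrative): GVIF reduces to the ordinary VIF when every
# term has a single coefficient (Df == 1), in which case a named numeric
# vector is returned instead of the 3-column matrix.
#   fit <- lm(mpg ~ hp + wt + disp, data = mtcars)
#   vif(fit)  # named vector with one VIF value per term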

# Calculate VIF and throw a user-friendly message in case of perfect collinearity.
calc_vif <- function(model, terms_mapping) {
  tryCatch({
    vif(model)
  }, error = function(e){
    # In case of perfect multicollinearity, vif throws an error with the message "there are aliased coefficients in the model".
    # Check if that is the case. If coef() includes NA, the corresponding variable is causing perfect multicollinearity.
    coef_vec <- coef(model)
    na_coef_vec <- coef_vec[is.na(coef_vec)]
    if (length(na_coef_vec) > 0) {
      na_coef_names <- names(na_coef_vec)
      na_coef_names <- map_terms_to_orig(na_coef_names, terms_mapping) # Map column names in the message back to original.
      message <- paste(na_coef_names, collapse = ", ")
      message <- paste0("Variables causing perfect collinearity : ", message)
      e$message <- message
    }
    stop(e)
  })
}
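
# Sketch of the friendlier error (illustrative, hypothetical data): with a
# perfectly collinear predictor, coef() has an NA entry and the raised message
# names the offending variable instead of the generic aliased-coefficients one.
#   df <- data.frame(y = rnorm(10), x1 = 1:10)
#   df$x2 <- df$x1 * 2
#   fit <- lm(y ~ x1 + x2, data = df)  # coef(fit)["x2"] is NA
#   # calc_vif(fit, terms_mapping) would stop with
#   # "Variables causing perfect collinearity : x2"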

# augment function that just filters out unknown categories in predictors to avoid errors.
augment.lm_exploratory_0 <- function(x, data = NULL, newdata = NULL, ...) {
  if (!is.null(newdata) && length(x$xlevels) > 0) {
    for (i in 1:length(x$xlevels)) {
      newdata <- newdata %>% dplyr::filter(!!rlang::sym(names(x$xlevels)[[i]]) %in% !!x$xlevels[[i]])
    }
  }
  if (is.null(data)) { # Giving the data argument when it is NULL causes an error from augment.glm. Do the same for lm just in case.
    ret <- broom:::augment.lm(x, newdata = newdata, se=TRUE, ...)
    if (!is.null(ret$.rownames)) { # Clean up the .rownames column that augment.glm adds for some reason. Do the same for lm just in case.
      ret$.rownames <- NULL
    }
  }
  else {
    ret <- broom:::augment.lm(x, data = data, newdata = newdata, se=TRUE, ...)
  }
  ret
}

tidy.lm_exploratory_0 <- function(x, ...) {
  # tidy.lm raises an error when the class has more than "lm". Work around it here.
  orig_class <- class(x)
  class(x) <- "lm"
  ret <- broom:::tidy.lm(x, ...)
  class(x) <- orig_class
  ret
}

#' lm wrapper with do
#' @return data frame which has lm model
#' @param data Data frame to be used as data
#' @param formula Formula for lm
#' @param ... Parameters to be passed to lm function
#' @param keep.source Whether source should be kept in source.data column
#' @param augment Whether the result should be augmented immediately
#' @param group_cols A vector with column names to be used as group columns
#' @param test_rate Ratio of test data
#' @param seed Random seed to control test data sampling
#' @export
build_lm <- function(data, formula, ..., keep.source = TRUE, augment = FALSE, group_cols = NULL, test_rate = 0.0, seed = 1){
  validate_empty_data(data)

  # make character variables factors sorted by frequency
  fml_vars <- all.vars(formula)
  for(var in fml_vars) {
    if(is.character(data[[var]])){
      data[[var]] <- forcats::fct_infreq(data[[var]])
    }
  }

  if(!is.null(seed)){
    set.seed(seed)
  }

  # deal with group columns by index because those names might be changed
  group_col_index <- colnames(data) %in% group_cols

  # change column names to avoid name conflict when tidy or glance are executed
  reserved_names <- c(
    "model", ".test_index", "data", ".model_metadata",
    # for tidy
    "term", "estimate", "std.error", "statistic", "p.value",
    # for glance
    "r.squared", "adj.r.squared", "sigma", "statistic", "p.value",
    "df", "logLik", "AIC", "BIC", "deviance", "df.residual"
  )


  if(test_rate < 0 | 1 < test_rate){
    stop("test_rate must be between 0 and 1")
  } else if (test_rate == 1){
    stop("test_rate must be less than 1")
  }

  colnames(data)[group_col_index] <- avoid_conflict(
    reserved_names,
    colnames(data)[group_col_index],
    ".group"
  )

  # make column names unique
  colnames(data) <- make.unique(colnames(data), sep = "")

  if(!is.null(group_cols)){
    data <- dplyr::group_by(data, !!!rlang::syms(colnames(data)[group_col_index]))
  }

  # Filter out NA and Inf from target variable.
  target_cols <- all.vars(lazyeval::f_lhs(formula))
  for (target_col in target_cols) {
    data <- data %>%
      dplyr::filter(!is.na(!!rlang::sym(target_col)) & !is.infinite(!!rlang::sym(target_col)))
  }

  group_col_names <- grouped_by(data)

  # check if grouping columns are in the formula
  grouped_var <- group_col_names[group_col_names %in% fml_vars]
  if (length(grouped_var) == 1) {
    stop(paste0(grouped_var, " is a grouping column. Please remove it from variables."))
  } else if (length(grouped_var) > 0) {
    stop(paste0(paste(grouped_var, collapse = ", "), " are grouping columns. Please remove them from variables."))
  }

  model_col <- "model"
  source_col <- "source.data"

  caller <- match.call()
  # this expands dots arguments to character
  arg_char <- expand_args(caller, exclude = c("data", "keep.source", "augment", "group_cols", "test_rate", "seed"))

  ret <- tryCatch({
    ret <- data %>%
      tidyr::nest(source.data=-dplyr::group_cols()) %>%
      # create test index
      dplyr::mutate(.test_index = purrr::map(source.data, function(df){
        sample_df_index(df, rate = test_rate)
      })) %>%
      # slice training data
      dplyr::mutate(model = purrr::map2(source.data, .test_index, function(df, index){
        data <- safe_slice(df, index, remove = TRUE)

        # execute lm with parsed arguments
        model <- eval(parse(text = paste0("stats::lm(data = data, ", arg_char, ")")))

        # Strip environments to save rds size when cached.
        if (!is.null(model$terms)) {
          attr(model$terms,".Environment")<-NULL
        }
        if (!is.null(model$model) && !is.null(attr(model$model,"terms"))) {
          attr(attr(model$model,"terms"),".Environment") <- NULL
        }

        class(model) <- c("lm_exploratory_0", class(model))
        model
      })) %>%
      dplyr::mutate(.model_metadata = purrr::map(source.data, function(df){
        if(!is.null(formula)){
          create_model_meta(df, formula)
        } else {
          list()
        }
      }))
    if(!keep.source & !augment){
      ret <- dplyr::select(ret, -source.data)
    } else {
      class(ret[[source_col]]) <- c("list", ".source.data")
    }
    ret <- dplyr::rowwise(ret)
    ret
  }, error = function(e){
    # Error message was changed when upgrading dplyr to 0.7.1,
    # so use stringr::str_detect to make these robust.
    # With dplyr 1.0.8, it seems that the message is separated into e$message and e$parent$message.
    # With dplyr 1.0.10, it seems that the message is further separated into e$message, e$parent$message, and e$parent$parent$message.
    # First extract the root message text.
    if (!is.null(e$parent)) {
      if (!is.null(e$parent$parent)) {
        message <- e$parent$parent$message
      }
      else {
        message <- e$parent$message
      }
    }
    else { # Handling for before dplyr 1.0.8. (With 6.9.5 we do not bundle dplyr 1.0.8 yet.)
      message <- e$message
    }
    # Run text match on the extracted message.
    if (stringr::str_detect(message, "contrasts can be applied only to factors with 2 or more levels")) {
      stop("more than 1 unique values are expected for categorical columns assigned as predictors")
    }
    if(stringr::str_detect(message, "0 \\(non\\-NA\\) cases")){
      stop("no data after removing NA")
    }
    stop(message)
  })
  if(augment){
    if(test_rate == 0){
      ret <- prediction(ret, data = "training")
    } else {
      ret <- prediction(ret, data = "test")
    }
  } else {
    class(ret[[model_col]]) <- c("list", ".model", ".model.lm")
  }
  ret
}
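
# Usage sketch (illustrative):
#   mtcars %>% build_lm(mpg ~ hp + wt)                      # one model per data frame
#   mtcars %>% build_lm(mpg ~ hp + wt, group_cols = "cyl")  # one model per group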

# Map terms back to the original variable names.
# If a term is for a categorical variable, e.g. c1_UA,
# it will be mapped to a name like "Carrier: UA".
map_terms_to_orig <- function(terms, mapping) {
  # Extract name part and value part of the coefficient name separately.
  var_names <- stringr::str_extract(terms, "^c[0-9]+_")
  var_values <- stringr::str_remove(terms, "^c[0-9]+_")
  var_names_orig <- mapping[var_names] # Map variable names back to the original.
  # is.na(var_names) is for "(Intercept)".
  # var_values == "" is for numerical variables.
  # Categorical variables are expected to have var_values.
  ret <- dplyr::if_else(is.na(var_names), terms, dplyr::if_else(var_values == "", var_names_orig, paste0(var_names_orig, ": ", var_values)))
  ret
}
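
# Example (illustrative):
#   mapping <- c(c1_ = "Carrier", c2_ = "Distance")
#   map_terms_to_orig(c("(Intercept)", "c1_UA", "c2_"), mapping)
#   #> "(Intercept)" "Carrier: UA" "Distance"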

#' Common preprocessing of regression data to be done BEFORE sampling.
#' Only operations that are also needed elsewhere, for example in Summary View, are done here.
#' @export
preprocess_regression_data_before_sample <- function(df, target_col, predictor_cols, filter_predictor_numeric_na=TRUE) {
  # ranger and rpart work with NA or Inf in the target, but we decided it would build a rather meaningless or biased model.
  # For example, rpart binary classification just replaces NAs with FALSE, which would change the meaning of the data inadvertently.
  # lm will filter out NAs anyway internally, and errors out if the remaining rows have a single value in any predictor column.
  # Also, filter Inf/-Inf too to avoid errors from lm.
  # So, we always filter out NA/Inf from the target variable.
  # NOTE: This has to be done before removing all-NA predictor columns, since this operation might create new all-NA predictor columns.
  df <- df %>%
    dplyr::filter(!is.na(!!rlang::sym(target_col)) & !is.infinite(!!rlang::sym(target_col))) # this form does not handle group_by. so moved into each_func from outside.

  # Remove all-NA-or-Inf columns.
  # NOTE: This has to be done before filtering predictor numeric NAs. Otherwise, all the rows could be filtered out.
  cols <- predictor_cols
  for (col in predictor_cols) {
    if(all(is.na(df[[col]]) | is.infinite(df[[col]]))){
      # remove columns if they are all NA or Inf
      cols <- setdiff(cols, col)
      df[[col]] <- NULL # drop the column so that SMOTE will not see it. 
    }
  }
  if (length(cols) == 0) {
    stop("No predictor column is left after removing columns with only NA or Inf values.")
  }

  # To avoid unused factor level that causes margins::marginal_effects() to fail, filtering operation has
  # to be done before factor level adjustments.
  # This is done before sampling so that we will end up with more valid rows in the end.
  for(col in cols){
    if(is.numeric(df[[col]]) || lubridate::is.Date(df[[col]]) || lubridate::is.POSIXct(df[[col]])) {
      # For numeric cols, filter NA rows, because lm will do this internally anyway, and errors out
      # if the remaining rows have a single value in any predictor column.
      # Filter Inf/-Inf too to avoid errors from lm.
      # Do the same for Date/POSIXct, because we will create numeric columns from them.

      # For rpart, filter NAs for numeric columns to avoid instability. It seems that the resulting tree from rpart sometimes becomes
      # simplistic (e.g. only one split in the tree), especially in Exploratory for some reason, if we let rpart handle NAs itself,
      # even though it is supposed to just filter out rows with NAs, which is the same as what we are doing here.
      if (filter_predictor_numeric_na) {
        df <- df %>% dplyr::filter(!is.na(!!rlang::sym(col)) & !is.infinite(!!rlang::sym(col)))
      }
      else {
        # For ranger, removing numeric NA is not necessary.
        # But even for ranger, filter Inf/-Inf to avoid following error from ranger.
        # Error in seq.default(min(x, na.rm = TRUE), max(x, na.rm = TRUE), length.out = length.out) : 'from' must be a finite number
        # TODO: In exp_rpart and calc_feature_imp, we have logic to remember and restore NA rows, but they are probably not made use of
        # if we filter NA rows here.
        df <- df %>% dplyr::filter(!is.na(!!rlang::sym(col)))
      }
    }
  }
  if (nrow(df) == 0) {
    stop("No row is left after removing NA/Inf from numeric, Date, or POSIXct columns.")
  }
  attr(df, 'predictors') <- cols
  df
}
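
# Usage sketch (illustrative): callers pick up the possibly reduced predictor
# list from the attribute, as build_lm.fast does.
#   df <- preprocess_regression_data_before_sample(df, target_col, predictor_cols)
#   predictor_cols <- attr(df, "predictors")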

#' Common preprocessing of regression data to be done AFTER sampling.
#' Only operations that are also needed elsewhere, for example in Summary View, are done here.
#' @export
#' @param df - Data frame to preprocess.
#' @param target_col - Name of the target column.
#' @param predictor_cols - Names of the predictor columns.
#' @param predictor_n - Maximum number of levels to keep per categorical predictor.
#' @param name_map - Named vector whose names are the original column names and whose values are the cleaned names.
#' @param other_level - Level for "Other" group. Default is "Other".
preprocess_regression_data_after_sample <- function(df, target_col, predictor_cols,
                                                    predictor_n = 12, # so that at least months can fit in it.
                                                    name_map = NULL, 
                                                    other_level = "Other") {
  c_cols <- predictor_cols
  for(col in predictor_cols){
    if(lubridate::is.Date(df[[col]]) || lubridate::is.POSIXct(df[[col]])) {
      c_cols <- setdiff(c_cols, col)

      absolute_time_col <- avoid_conflict(colnames(df), paste0(col, "_abs_time"))
      wday_col <- avoid_conflict(colnames(df), paste0(col, "_w_"))
      day_col <- avoid_conflict(colnames(df), paste0(col, "_day_of_month"))
      yday_col <- avoid_conflict(colnames(df), paste0(col, "_day_of_year"))
      month_col <- avoid_conflict(colnames(df), paste0(col, "_m_"))
      year_col <- avoid_conflict(colnames(df), paste0(col, "_year"))
      new_name <- c(absolute_time_col, wday_col, day_col, yday_col, month_col, year_col)
      names(new_name) <- paste(
        names(name_map)[name_map == col],
        c(
          "_abs_time",
          "_w_",
          "_day_of_month",
          "_day_of_year",
          "_m_",
          "_year"
        ), sep="")

      name_map <- c(name_map, new_name)

      c_cols <- c(c_cols, absolute_time_col, wday_col, day_col, yday_col, month_col, year_col)
      df[[absolute_time_col]] <- as.numeric(df[[col]])
      # turn it into an unordered factor since, if it is an ordered factor,
      # lm/glm creates polynomial terms (Linear, Quadratic, Cubic, and so on) and uses them as variables,
      # which we do not want for this function.
      # Reference: https://hlplab.wordpress.com/2008/01/28/the-mysterious-l-q-and-c/
      df[[wday_col]] <- factor(lubridate::wday(df[[col]], label=TRUE), ordered=FALSE)
      # lubridate::day returns integer.
      # Convert integer to numeric. mmpf::marginalPrediction we use for partial dependence throws assertion error, if the data is integer and specified grid points are not integer.
      df[[day_col]] <- as.numeric(lubridate::day(df[[col]]))
      df[[yday_col]] <- lubridate::yday(df[[col]])
      # turn it into unordered factor for the same reason as wday.
      df[[month_col]] <- factor(lubridate::month(df[[col]], label=TRUE), ordered=FALSE)
      df[[year_col]] <- lubridate::year(df[[col]])
      if(lubridate::is.POSIXct(df[[col]])) {
        hour_col <- avoid_conflict(colnames(df), paste0(col, "_hour"))
        new_name <- c(hour_col)
        names(new_name) <- paste(
          names(name_map)[name_map == col],
          c(
            "_hour"
          ), sep="")
        name_map <- c(name_map, new_name)

        c_cols <- c(c_cols, hour_col)
        df[[hour_col]] <- factor(lubridate::hour(df[[col]])) # treat hour as category
      }
      df[[col]] <- NULL # drop original Date/POSIXct column to pass SMOTE later.
    } else if(is.factor(df[[col]])) {
      df[[col]] <- forcats::fct_drop(df[[col]]) # margins::marginal_effects() fails if unused factor level exists. Drop them to avoid it.
      # 1. if the data is a factor, respect the levels, keep the first predictor_n levels, and lump the others into the other_level group.
      # 2. if the data is an ordered factor, turn it into an unordered one. For an ordered factor,
      #    lm/glm creates polynomial terms (Linear, Quadratic, Cubic, and so on) and uses them as variables,
      #    which we do not want for this function.
      if (length(levels(df[[col]])) > predictor_n + 1) { # +1 is for the 'Other' group.
        df[[col]] <- forcats::fct_other(factor(df[[col]], ordered=FALSE), keep=levels(df[[col]])[1:predictor_n], other_level=other_level)
      }
      else {
        df[[col]] <- factor(df[[col]], ordered=FALSE)
      }
      # turn NA into (Missing) factor level. Without this, lm or glm drops rows internally.
      df[[col]] <- forcats::fct_explicit_na(df[[col]])
    } else if(is.logical(df[[col]])) {
      # 1. convert data to factor if predictors are logical. (as.factor() on logical always puts FALSE as the first level, which is what we want for predictor.)
      # 2. turn NA into (Missing) factor level so that lm will not drop all the rows.
      # For ranger, we need to convert logical to factor too since na.roughfix only works for numeric or factor.
      df[[col]] <- forcats::fct_explicit_na(as.factor(df[[col]]))
    } else if(!is.numeric(df[[col]])) {
      # 1. convert data to factor if predictors are not numeric or logical
      #    and limit the number of levels in factor by fct_lump.
      #    we use ties.method to handle the case where there are many unique values. (without it, they all survive fct_lump.)
      #    TODO: see if ties.method would make sense for calc_feature_imp.
      # 2. turn NA into (Missing) factor level so that lm will not drop all the rows.
      df[[col]] <- forcats::fct_explicit_na(forcats::fct_lump(forcats::fct_infreq(as.factor(df[[col]])), n=predictor_n, ties.method="first", other_level=other_level))
    } else if(is.integer(df[[col]])) {
      # Convert integer to numeric. mmpf::marginalPrediction we use for partial dependence throws assertion error, if the data is integer and specified grid points are not integer.
      # To avoid something like that, we just convert integer to numeric before building predictive models.
      df[[col]] <- as.numeric(df[[col]])
    }
  }
  # If target is factor, turn it into unordered factor if it is not already.
  if (is.factor(df[[target_col]])) {
    df[[target_col]] <- factor(df[[target_col]], ordered=FALSE)
  }
  # remove columns with only one unique value
  cols_copy <- c_cols
  for (col in cols_copy) {
    unique_val <- unique(df[[col]])
    if (length(unique_val[!is.na(unique_val)]) == 1) {
      c_cols <- setdiff(c_cols, col)
      df[[col]] <- NULL # drop the column so that SMOTE will not see it. 
    }
  }
  if (length(c_cols) == 0) {
    # Previous version of message - stop("The selected predictor variables are invalid since they have only one unique values.")
    stop("Invalid Predictors: Only one unique value.") # Message is made short so that it fits well in the Summary table.
  }
  attr(df, 'predictors') <- c_cols # Pass up list of updated predictor column names.
  attr(df, 'name_map') <- name_map # Pass up list of updated column name map. Just for legacy Date/POSIXct column handling. 
  df
}

remove_outliers_for_regression_data <- function(df, target_col, predictor_cols,
                                                target_outlier_filter_type,
                                                target_outlier_filter_threshold,
                                                predictor_outlier_filter_type,
                                                predictor_outlier_filter_threshold) {
  if (!is.null(target_outlier_filter_type) || !is.null(predictor_outlier_filter_type)) {
    df$.is.outlier <- FALSE #TODO: handle possibility of name conflict.
    if (!is.null(target_outlier_filter_type)) {
      is_outlier <- function(x) {
        res <- detect_outlier(x, type=target_outlier_filter_type, threshold=target_outlier_filter_threshold) %in% c("Lower", "Upper")
        res
      }
      if (is.numeric(df[[target_col]])) {
        df$.is.outlier <- df$.is.outlier | is_outlier(df[[target_col]])
      }
    }

    if (!is.null(predictor_outlier_filter_type)) {
      is_outlier <- function(x) {
        res <- detect_outlier(x, type=predictor_outlier_filter_type, threshold=predictor_outlier_filter_threshold) %in% c("Lower", "Upper")
        res
      }
      for (col in predictor_cols) {
        if (is.numeric(df[[col]])) {
          df$.is.outlier <- df$.is.outlier | is_outlier(df[[col]])
        }
      }
    }
    df <- df %>% dplyr::filter(!.is.outlier)
    df$.is.outlier <- NULL # Removing the temporary column.
  }
  df
}

#' builds lm model quickly for analytics view.
#' @param seed - Random seed to control data sampling, SMOTE, and bootstrapping for confidence interval of relative importance.
#' @param test_rate Ratio of test data
#' @export
build_lm.fast <- function(df,
                    target,
                    ...,
                    target_fun = NULL,
                    predictor_funs = NULL,
                    weight = NULL,
                    weight_fun = NULL,
                    model_type = "lm",
                    family = NULL,
                    link = NULL,
                    max_nrow = 50000,
                    predictor_n = 12, # so that at least months can fit in it.
                    normalize_target = FALSE,
                    normalize_predictors = FALSE,
                    target_outlier_filter_type = NULL,
                    target_outlier_filter_threshold = NULL,
                    predictor_outlier_filter_type = NULL,
                    predictor_outlier_filter_threshold = NULL,
                    smote = FALSE,
                    smote_target_minority_perc = 40,
                    smote_max_synth_perc = 200,
                    smote_k = 5,
                    importance_measure = "permutation", # "firm", or "permutation" if available. Defaulting to permutation for backward compatibility for pre-6.5.
                    with_marginal_effects = FALSE,
                    with_marginal_effects_confint = FALSE,
                    variable_metric = NULL,
                    p_value_threshold = 0.05,
                    max_pd_vars = 20,
                    pd_sample_size = 500,
                    pd_grid_resolution = 20,
                    seed = 1,
                    test_rate = 0.0,
                    test_split_type = "random" # "random" or "ordered"
                    ){
  target_col <- tidyselect::vars_select(names(df), !! rlang::enquo(target))
  orig_selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))

  target_funs <- NULL
  if (!is.null(target_fun)) {
    target_funs <- list(target_fun)
    names(target_funs) <- target_col
    df <- df %>% mutate_predictors(target_col, target_funs)
  }

  if (!is.null(predictor_funs)) {
    df <- df %>% mutate_predictors(orig_selected_cols, predictor_funs)
    selected_cols <- names(unlist(predictor_funs))
  }
  else {
    selected_cols <- orig_selected_cols
  }

  weight_col <- tidyselect::vars_select(names(df), !! rlang::enquo(weight))
  if (is.null(weight_col) || length(weight_col) == 0) { # It seems that if weight is not specified, weight_col can be a named character(0), which can be detected by length(weight_col) == 0.
    weight_col <- NULL
  }
  if (!is.null(weight_col) && !is.null(weight_fun)) {
    weight_funs <- list(weight_fun)
    names(weight_funs) <- weight_col
    df <- df %>% mutate_predictors(weight_col, weight_funs)
  }
  if (!is.null(weight_col) && min(df[[weight_col]], na.rm=TRUE) <= 0) {
    # We enforce strict positive values because with weights including 0, the model throws error at prediction.
    stop("EXP-ANA-10 :: [] :: Weight column must be positive.")
  }
  if (!is.null(weight_col)) {
    # Fill NA weight with 1.
    df <- df %>% dplyr::mutate(!!rlang::sym(weight_col) := ifelse(is.na(!!rlang::sym(weight_col)), 1, !!rlang::sym(weight_col)))
  }

  grouped_cols <- grouped_by(df)

  if (!is.null(variable_metric)  && variable_metric == "ame") { # Special argument for integration with Analytics View.
    with_marginal_effects <- TRUE
  }

  if (model_type == "glm" && is.null(family)) {
    family <- "binomial" # default for glm is logistic regression.
    link <- "logit"
  }

  # For backward compatibility, importance_measure defaults to "permutation",
  # but these GLMs do not have permutation importance support. Fall back to FIRM.
  if (importance_measure == "permutation" &&
      model_type == "glm" && family %in% c("Gamma", "negativebinomial", "inverse.gaussian")) {
    importance_measure <- "firm"
  }


  if (model_type == "glm" && family == "binomial" && (is.null(link) || link == "logit")) {
    if (!is.logical(df[[target_col]]) && !is.numeric(df[[target_col]])) {
      # numeric still works, but our official guidance will be to use logical.
      stop("Target variable for logistic regression must be a logical.")
    }
  }

  # If we do permutation importance, sort the predictors so that its result is stable against changes in column order.
  # Otherwise, avoid sorting so that the user has control over the order of variables on the partial dependence plot.
  if (model_type == "lm" || family %in% c("binomial", "gaussian") || (family == "poisson" && (is.null(link) || link == "log"))) {
    selected_cols <- stringr::str_sort(selected_cols)
  }

  if(test_rate < 0 | 1 < test_rate){
    stop("test_rate must be between 0 and 1")
  } else if (test_rate == 1){
    stop("test_rate must be less than 1")
  }

  # drop unrelated columns so that SMOTE later does not have to deal with them.
  # select_ was not able to handle spaces in target_col. Let's do it the base R way.
  df <- df[,colnames(df) %in% c(grouped_cols, selected_cols, target_col, weight_col), drop=FALSE]

  # Remove grouped col or target col. Weight col is not removed because it can be one of the predictors.
  selected_cols <- setdiff(selected_cols, c(grouped_cols, target_col))

  if (any(c(target_col, selected_cols, weight_col) %in% grouped_cols)) {
    stop("grouping column is used as variable columns")
  }

  if (predictor_n < 2) {
    stop("Max # of categories for explanatory vars must be at least 2.")
  }

  orig_levels <- NULL # For recording original factor levels for binary classification, to show data size for each class in Summary View.
  if (!is.null(model_type) && model_type == "glm" && family %in% c("binomial", "quasibinomial")) {
    # binomial case.
    unique_val <- unique(df[[target_col]])
    if (length(unique_val[!is.na(unique_val)]) != 2) {
      stop(paste0("Column to predict (", target_col, ") with Binomial Regression must have 2 unique values."))
    }
    if (!is.numeric(df[[target_col]]) && !is.factor(df[[target_col]]) && !is.logical(df[[target_col]])) {
      # make other types factor so that it passes stats::glm call.
      df[[target_col]] <- factor(df[[target_col]])
    }
    # record original factor levels.
    if (is.factor(df[[target_col]])) {
      orig_levels <- levels(df[[target_col]])
      # Keep only the ones that actually are used.
      # One with larger index seems to be treated as TRUE (i.e. 1 in model$y) by glm.
      orig_levels <- orig_levels[orig_levels %in% unique_val]
    }
    else if (is.logical(df[[target_col]])) {
      orig_levels <- c("FALSE","TRUE")
    }
  }
  else { # this means the model is lm or glm with family other than binomial
    if (!is.numeric(df[[target_col]])) {
      # TODO: message should handle other than lm too.
      stop(paste0("Column to predict (", target_col, ") with Linear Regression must be numeric"))
    }
  }

  # Make a copy of the data frame whose columns will be renamed to model-safe names.
  clean_df <- df
  # Replace column names with names like c1_, c2_...
  # The trailing _ is so that the name part and value part of a categorical coefficient can be separated later,
  # even with values that start with a number like "9E".
  # We avoid the following known issues by this.
  # - Column name issue for marginal_effects(). Space is not handled well.
  #   ()"' are known to be ok as of version 5.5.2.
  # - Column name issue for mmpf::marginalPrediction (for partial dependence). Comma is not handled well.
  # Note that the data frames in the source_data column keep the replaced column names for simplicity,
  # rather than the original column names, unlike what we do for ranger or rpart. (Maybe we should do the same for ranger or rpart.)
  # The reverse mapping of the column names is done in augment.lm_exploratory or augment.glm_exploratory.
  names(clean_df) <- paste0("c",1:length(colnames(clean_df)), "_")
  # this mapping will be used to restore column names
  name_map <- colnames(clean_df)
  names(name_map) <- colnames(df)

  # Renaming the columns does not update the grouping column name
  # information stored in the data frame, which causes an error.
  # So, for grouping columns, the values in name_map should remain
  # the grouping column names themselves.
  name_map[grouped_cols] <- grouped_cols
  colnames(clean_df) <- name_map

  clean_target_col <- name_map[target_col]
  if (!is.null(weight_col)) {
    clean_weight_col <- name_map[weight_col]
  }
  else {
    clean_weight_col <- NULL
  }
  clean_cols <- name_map[selected_cols]

  each_func <- function(df) {
    tryCatch({
      if(!is.null(seed)){
        set.seed(seed)
      }

      df_test <- NULL # declare variable for test data

      df <- preprocess_regression_data_before_sample(df, clean_target_col, clean_cols)
      clean_cols <- attr(df, 'predictors') # predictors are updated (removed) in preprocess_pre_sample. Catch up with it.

      # Sample the data because model building takes a long time if the data size is too large.
      # If we are to do SMOTE, do not down sample here and let exp_balance handle it so that we do not sample out precious minority data.
      sampled_nrow <- NULL
      if (!smote) {
        if (!is.null(max_nrow) && nrow(df) > max_nrow) {
          # Record that sampling happened.
          sampled_nrow <- max_nrow
          df <- df %>% sample_rows(max_nrow)
        }
      }

      # Remove outliers if specified so.
      # This has to be done before preprocess_regression_data_after_sample, since it can remove rows and reduce number of unique values,
      # just like sampling.
      df <- remove_outliers_for_regression_data(df, clean_target_col, clean_cols,
                                                target_outlier_filter_type,
                                                target_outlier_filter_threshold,
                                                predictor_outlier_filter_type,
                                                predictor_outlier_filter_threshold)

      # Capture the classes of the columns at this point before preprocess_regression_data_after_sample,
      # so that we know the original classes of columns before characters are turned into factors,
      # so that we can sort the partial dependence data for display accordingly.
      # preprocess_regression_data_after_sample can remove columns, but it should not cause problem that we have more columns in
      # orig_predictor_classes than the partial dependence data.
      # Also, preprocess_regression_data_after_sample has code to add columns extracted from Date/POSIXct, but with recent releases,
      # that should not happen, since the extraction is already done by mutate_predictors.
      orig_predictor_classes <- capture_df_column_classes(df, clean_cols)
      df <- preprocess_regression_data_after_sample(df, clean_target_col, clean_cols, predictor_n = predictor_n, name_map = name_map)
      c_cols <- attr(df, 'predictors') # predictors are updated (added and/or removed) in preprocess_post_sample. Catch up with it.
      name_map <- attr(df, 'name_map')

      # Normalize numeric target variable,
      # after all column changes for Date/POSIXct, filtering, dropping columns above are done.
      if (normalize_target) {
        if (is.numeric(df[[clean_target_col]])) {
          df[[clean_target_col]] <- normalize(df[[clean_target_col]])
        }
      }
      # Normalize numeric predictors so that resulting coefficients are comparable among them,
      # after all column changes for Date/POSIXct, filtering, dropping columns above are done.
      if (normalize_predictors) {
        for (col in c_cols) {
          if (is.numeric(df[[col]])) {
            df[[col]] <- normalize(df[[col]])
          }
        }
      }

      # Reverse mapping of variable names.
      terms_mapping <- names(name_map)
      names(terms_mapping) <- name_map

      # build formula for lm
      rhs <- paste0("`", c_cols, "`", collapse = " + ")
      # TODO: This clean_target_col is actually not a cleaned column name since we want lm to show real name. Clean up our variable name.
      fml <- as.formula(paste0("`", clean_target_col, "` ~ ", rhs))
      if (model_type == "glm") {
        if (smote) {
          if (with_marginal_effects) {
            # Keep the version of the data before SMOTE,
            # since we want to know the average marginal effect on data whose
            # distribution is close to that of the original data.
            df_before_smote <- df
          }
          # Note: If there is weight column, we synthesize weight column as well.
          df <- df %>% exp_balance(clean_target_col, target_size = max_nrow, target_minority_perc = smote_target_minority_perc, max_synth_perc = smote_max_synth_perc, k = smote_k)
          df <- df %>% dplyr::select(-synthesized) # Remove synthesized column added by exp_balance(). TODO: Handle it better. We might want to show it in resulting data.
          for(col in names(df)){
            if(is.factor(df[[col]])) {
              # margins::marginal_effects() fails if unused factor level exists. Drop them to avoid it.
              # In case of SMOTE, this has to be done after that. TODO: Do this just once in any case.
              df[[col]] <- forcats::fct_drop(df[[col]])
              if (with_marginal_effects) {
                df_before_smote <- df_before_smote %>% dplyr::filter(!!rlang::sym(col) %in% levels(df[[col]]))
                df_before_smote[[col]] <- forcats::fct_drop(df_before_smote[[col]])
              }
            }
          }
          if (with_marginal_effects) {
            # Sample df_before_smote for speed.
            # We do not remove the imbalance here, to keep the distribution close to the original data.
            df_before_smote <- df_before_smote %>% sample_rows(max_nrow)
          }
        }

        # split training and test data
        source_data <- df
        test_index <- sample_df_index(source_data, rate = test_rate, ordered = (test_split_type == "ordered"))
        df <- safe_slice(source_data, test_index, remove = TRUE)
        if (test_rate > 0) {
          # Test mode. Make prediction with test data here, rather than repeating it in Analytics View preprocessors.
          df_test <- safe_slice(source_data, test_index, remove = FALSE)
        }

        # When link is not specified, use the default link function for each family,
        # except when the family is negativebinomial, for which we should use MASS::glm.nb.
        if (is.null(link) && family != "negativebinomial") {
          if (is.null(clean_weight_col)) {
            model <- stats::glm(fml, data = df, family = family) 
          }
          else {
            model <- stats::glm(fml, data = df, family = family, weights=df[[clean_weight_col]])
          }
        }
        else {
          if (family == "gaussian") {
            family_arg <- stats::gaussian(link=link)
          }
          else if (family == "binomial") {
            family_arg <- stats::binomial(link=link)
          }
          else if (family == "Gamma") {
            family_arg <- stats::Gamma(link=link)
          }
          else if (family == "inverse.gaussian") {
            family_arg <- stats::inverse.gaussian(link=link)
          }
          else if (family == "poisson") {
            family_arg <- stats::poisson(link=link)
          }
          else if (family == "quasi") {
            family_arg <- stats::quasi(link=link)
          }
          else if (family == "quasibinomial") {
            family_arg <- stats::quasibinomial(link=link)
          }
          else if (family == "quasipoisson") {
            family_arg <- stats::quasipoisson(link=link)
          }
          else if (family == "negativebinomial") {
            family_arg <- family
          }
          else { # default gaussian
            family_arg <- stats::gaussian(link=link)
          }

          if (family == "negativebinomial"){
            # In the MASS::glm.nb function, the link arg must be one of log, sqrt, or identity.
            # So if any other link is specified (or none), fall back to "log", the default value.
            if (is.null(link) || (!link %in% c("log", "sqrt", "identity"))){
              link <- "log"
            }

            if (dplyr::n_distinct(df[[clean_target_col]]) == 1) {
              # If the target column has only 1 unique value, glm.nb seems to raise an error like the following.
              # Error in while ((it <- it + 1) < limit && abs(del) > eps) { :
              # missing value where TRUE/FALSE needed
              stop("Target column has only one unique value.")
            }
            # The link argument of MASS::glm.nb is lazily evaluated via substitution,
            # so the variable name passed as the argument is captured literally as the link.
            # For example, if you call MASS::glm.nb(fml, data = df, link = link), the following error occurs:
            # link "link" not available for poisson family; available links are 'log', 'identity', 'sqrt'
            # Therefore, we use eval to pass the string (log etc.) specified in the argument to link as it is.
            if (is.null(clean_weight_col)) {
              model <- eval(parse(text=paste0("MASS::glm.nb(fml, data=df, link=", link, ")")))
            }
            else {
              model <- eval(parse(text=paste0("MASS::glm.nb(fml, data=df, link=", link, ", weights=df[[clean_weight_col]])")))
            }

            # A model from MASS::glm.nb does not have a formula attribute.
            model$formula <- fml
          } else {
            if (is.null(clean_weight_col)) {
              model <- stats::glm(fml, data = df, family = family_arg) 
            }
            else {
              model <- stats::glm(fml, data = df, family = family_arg, weights=df[[clean_weight_col]])
            }
          }
        }
        if (length(c_cols) > 1) { # Skip importance calculation if there is only one variable.
          if (importance_measure == "permutation") { # For firm case, we need to first calculate partial dependence.
            if (family == "binomial") {
              model$imp_df <- calc_permutation_importance_binomial(model, clean_target_col, c_cols, df)
            }
            else if (family == "poisson" && (is.null(link) || link == "log")) {
              model$imp_df <- calc_permutation_importance_poisson(model, clean_target_col, c_cols, df)
            }
            else if (family == "gaussian") {
              model$imp_df <- calc_permutation_importance_gaussian(model, clean_target_col, c_cols, df)
            }
          }
        }
        else {
          error <- simpleError("Variable importance requires two or more variables.")
          model$imp_df <- error
        }
      }
      else { # model_type == "lm"
        # split training and test data
        source_data <- df
        test_index <- sample_df_index(source_data, rate = test_rate, ordered = (test_split_type == "ordered"))
        df <- safe_slice(source_data, test_index, remove = TRUE)
        if (test_rate > 0) {
          df_test <- safe_slice(source_data, test_index, remove = FALSE)
        }
        if (is.null(clean_weight_col)) {
          model <- stats::lm(fml, data = df)
        }
        else {
          model <- stats::lm(fml, data = df, weights=df[[clean_weight_col]]) 
        }
        if (length(c_cols) > 1) { # Skip importance calculation if there is only one variable.
          if (importance_measure == "permutation") { # For firm case, we need to first calculate partial dependence.
            model$imp_df <- calc_permutation_importance_linear(model, clean_target_col, c_cols, df)
          }
        }
        else {
          error <- simpleError("Variable importance requires two or more variables.")
          model$imp_df <- error
        }
      }

      # Strip environments to save rds size when cached.
      if (!is.null(model$terms)) {
        attr(model$terms,".Environment")<-NULL
      }
      if (!is.null(model$formula)) {
        attr(model$formula,".Environment")<-NULL
      }
      if (!is.null(model$model) && !is.null(attr(model$model,"terms"))) {
        attr(attr(model$model,"terms"),".Environment") <- NULL
      }

      tryCatch({
        model$vif <- calc_vif(model, terms_mapping)
      }, error = function(e){
        model$vif <<- e
      })

      if (test_rate > 0) {
        # Remove rows with categorical values that do not appear in the training data and are unknown to the model.
        # Record where they were in unknown_category_rows_index, and keep it with the model, so that a prediction that
        # matches the original data can be generated later.
        df_test_clean <- cleanup_df_for_test(df_test, df, c_cols)
        unknown_category_rows_index <- attr(df_test_clean, "unknown_category_rows_index")

        # Note: Do not pass df_test as data=df_test. For some reason, that ends up with predict returning the training data prediction.
        model$prediction_test <- predict(model, df_test_clean, se.fit = TRUE)
        model$prediction_test$unknown_category_rows_index <- unknown_category_rows_index
      }
      # These attributes are used in tidy of randomForest. TODO: is this good for lm too?
      model$terms_mapping <- terms_mapping
      model$orig_levels <- orig_levels

      # For displaying if sampling happened or not.
      model$sampled_nrow <- sampled_nrow

      # add special lm_exploratory class for adding extra info at glance().
      if (model_type == "glm") {
        class(model) <- c("glm_exploratory", class(model))
        if (with_marginal_effects) { # For now, we have tested marginal_effects for logistic regression only. It seems to fail for probit for example.
          if (smote) {
            model$marginal_effects <- calc_average_marginal_effects(model, data=df_before_smote, with_confint=with_marginal_effects_confint) # This has to be done after glm_exploratory class name is set.
          }
          else {
            model$marginal_effects <- calc_average_marginal_effects(model, with_confint=with_marginal_effects_confint) # This has to be done after glm_exploratory class name is set.
          }
        }
      }
      else {
        class(model) <- c("lm_exploratory", class(model))
      }
      # Calculate variable importances. 
      if (!is.null(model$imp_df) && "error" %nin% class(model$imp_df)) { # if importance is available, show only the max_pd_vars most important vars.
        imp_df <- model$imp_df
        imp_vars <- as.character((imp_df %>% arrange(-importance))$variable)
        imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # Keep only max_pd_vars most important variables
      }
      else if (importance_measure == "firm") {
        imp_vars <- c_cols # For FIRM, we need to calculate partial dependence for all predictors first.
      }
      else  {
        # Show only the max_pd_vars most significant vars (the ones with the smallest P values).
        # For a categorical variable, pick the smallest P value among all of its classes.
        c_cols_list <- as.list(c_cols)
        # Since broom:::tidy.lm raises "Error: No tidy method for objects of class lm_exploratory",
        # always use broom:::tidy.glm, which does not have this problem and seems to return the same result,
        # even for lm.
        coef_df <- broom:::tidy.glm(model)
        # str_detect matches all the categorical class terms that belong to the variable.
        p_values_list <- c_cols_list %>% purrr::map(function(x){(coef_df %>% filter(stringr::str_detect(term, paste0("^`?", x))) %>% summarize(p.value=min(p.value, na.rm=TRUE)))$p.value})
        p_values_df <- tibble::tibble(term=c_cols, p.value=purrr::flatten_dbl(p_values_list))
        imp_vars <- (p_values_df %>% dplyr::arrange(p.value))$term
        imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # Keep only max_pd_vars most important variables

        # We tried showing only significant variables, but decided that oftentimes we wanted to see even insignificant ones. Keeping that code for now.
        #
        # signif_df <- broom::tidy(model) %>% filter(p.value < p_value_threshold) # One-liner to keep only significant predictors by matching names with result of tidy().
        # if (nrow(signif_df) > 0) {
        #   imp_vars <- c_cols[sapply(c_cols,function(x){any(stringr::str_detect(signif_df$term, paste0("^`?", x)))})]
        # }
        # else  {
        #   imp_vars <- c()
        # }
      }

      if (length(imp_vars) > 0) {
        model$imp_vars <- imp_vars
        model$partial_dependence <- partial_dependence.lm_exploratory(model, target=clean_target_col, vars=imp_vars, data=df, n=c(pd_grid_resolution, min(nrow(df), pd_sample_size)))
      }
      else {
        model$partial_dependence <- NULL
      }

      if (importance_measure == "firm") {
        if (length(c_cols) > 1) { # Skip importance calculation if there is only one variable.
          pdp_target_col <- attr(model$partial_dependence, "target")
          imp_df <- importance_firm(model$partial_dependence, pdp_target_col, imp_vars)
          model$imp_df <- imp_df
          imp_vars <- imp_df$variable
          model$imp_vars <- imp_vars
        }
        else {
          error <- simpleError("Variable importance requires two or more variables.")
          model$imp_df <- error
        }

        imp_vars <- imp_vars[1:min(length(imp_vars), max_pd_vars)] # take max_pd_vars most important variables
        model$imp_vars <- imp_vars
        # Shrink the partial dependence data keeping only the important variables.
        model$partial_dependence <- shrink_partial_dependence_data(model$partial_dependence, imp_vars)
      }

      if (!is.null(target_funs)) {
        model$orig_target_col <- target_col
        model$target_funs <- target_funs
      }
      if (!is.null(predictor_funs)) {
        model$orig_predictor_cols <- orig_selected_cols
        attr(predictor_funs, "LC_TIME") <- Sys.getlocale("LC_TIME")
        attr(predictor_funs, "sysname") <- Sys.info()[["sysname"]] # Save platform name (e.g. Windows) since locale name might need conversion for the platform this model will be run on.
        attr(predictor_funs, "lubridate.week.start") <- getOption("lubridate.week.start")
        model$predictor_funs <- predictor_funs
      }

      model$target_col <- clean_target_col # We use target column info to bring the column next to the predicted value in the prediction() output.
      model$orig_predictor_classes <- orig_predictor_classes

      list(model = model, test_index = test_index, source_data = source_data)
    }, error = function(e){
      if(length(grouped_cols) > 0) {
        # In repeat-by case, we report group-specific error in the Summary table,
        # so that analysis on other groups can go on.
        if (model_type == "glm") {
          class(e) <- c("glm_exploratory", class(e))
        }
        else {
          class(e) <- c("lm_exploratory", class(e))
        }
        list(model = e, test_index = NULL, source_data = NULL)
      } else {
        stop(e)
      }
    })
  }

  model_and_data_col <- "model_and_data"
  ret <- do_on_each_group(clean_df, each_func, name = model_and_data_col, with_unnest = FALSE)
  if (length(grouped_cols) > 0) {
    ret <- ret %>% tidyr::nest(-grouped_cols)
  } else {
    ret <- ret %>% tidyr::nest()
  }
  ret <- ret %>% dplyr::ungroup() # Remove rowwise grouping so that following mutate works as expected.
  ret <- ret %>% dplyr::mutate(model = purrr::map(data, function(df){
            df[[model_and_data_col]][[1]]$model
          })) %>%
          dplyr::mutate(.test_index = purrr::map(data, function(df){
            df[[model_and_data_col]][[1]]$test_index
          })) %>%
          dplyr::mutate(source.data = purrr::map(data, function(df){
            data <- df[[model_and_data_col]][[1]]$source_data
            if (length(grouped_cols) > 0 && !is.null(data)) {
              data %>% dplyr::select(-grouped_cols)
            } else {
              data
            }
          })) %>%
          dplyr::select(-data)
  # Rowwise grouping has to be redone with original grouped_cols, so that summarize(tidy(model)) later can add back the group column.
  if (length(grouped_cols) > 0) {
    ret <- ret %>% dplyr::rowwise(grouped_cols)
  } else {
    ret <- ret %>% dplyr::rowwise()
  }
  ret
}
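
# An illustrative sketch (not executed) of the shape of the data frame returned above,
# assuming a hypothetical group_by column `country`:
#
#   country  model             .test_index    source.data
#   "US"     <lm_exploratory>  c(3, 10, ...)  <data.frame>
#   "UK"     <lm_exploratory>  c(1, 7, ...)   <data.frame>
#
# The result is rowwise-grouped so that tidy()/glance()/augment() can later be
# applied per group via the *_rowwise helpers.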

#' special version of glance.lm function to use with build_lm.fast.
#' @export
glance.lm_exploratory <- function(x, pretty.name = FALSE, ...) { #TODO: add test
  if ("error" %in% class(x)) {
    ret <- data.frame(Note = x$message)
    return(ret)
  }
  ret <- broom:::glance.lm(x)

  # Add Max VIF if VIF is available.
  if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
    vif_df <- vif_to_dataframe(x)
    if (nrow(vif_df) > 0 ) {
      max_vif <- max(vif_df$VIF, na.rm=TRUE)
      ret <- ret %>% dplyr::mutate(`Max VIF`=!!max_vif)
    }
  }

  # Adjust for the subtle difference between sigma (Residual Standard Error) and RMSE:
  # in RMSE, the division is by the number of observations, while in sigma it is by the residual degrees of freedom.
  # https://www.rdocumentation.org/packages/sjstats/versions/0.17.4/topics/cv
  # https://stat.ethz.ch/pipermail/r-help/2012-April/308935.html
  rmse_val <- sqrt(ret$sigma^2 * x$df.residual / nrow(x$model))
  sample_size <- nrow(x$model)
  ret <- ret %>% dplyr::mutate(rmse=!!rmse_val, n=!!sample_size)
  # Drop sigma in favor of rmse.
  ret <- ret %>% dplyr::select(r.squared, adj.r.squared, rmse, statistic, p.value, n, everything(), -sigma)

  if(pretty.name) {
    ret <- ret %>% dplyr::rename(`R Squared`=r.squared, `Adj R Squared`=adj.r.squared, `RMSE`=rmse, `F Value`=statistic, `P Value`=p.value, `DF`=df, `Log Likelihood`=logLik, `Residual Deviance`=deviance, `Residual DF`=df.residual, `Rows`=n)
    # Place the Max VIF column after the Residual Deviance.
    # ret <- ret %>% dplyr::relocate(`Residual DF`, .after=`Residual Deviance`)

    # Note column might not exist. Rename if it is there.
    colnames(ret)[colnames(ret) == "note"] <- "Note"
  }
  if (!is.null(ret$nobs)) { # glance.lm's newly added nobs seems to be the same as Number of Rows. Suppressing it for now.
    ret <- ret %>% dplyr::select(-nobs)
  }
  ret
}
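
# Illustrative usage (a sketch; assumes `model_df` is the data frame returned by build_lm.fast):
#   model_df %>% glance_rowwise(model, pretty.name = TRUE)
# returns one summary row per group with R Squared, Adj R Squared, RMSE, F Value, P Value, Rows, etc.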

#' special version of glance.glm function to use with build_lm.fast.
#' @export
glance.glm_exploratory <- function(x, pretty.name = FALSE, binary_classification_threshold = 0.5, ...) { #TODO: add test
  if ("error" %in% class(x)) {
    ret <- data.frame(Note = x$message)
    return(ret)
  }
  ret <- broom:::glance.glm(x)

  # Calculate the model p-value, since glm does not provide it as is.
  # https://stats.stackexchange.com/questions/129958/glm-in-r-which-pvalue-represents-the-goodness-of-fit-of-entire-model
  f0 <- x$formula # Copy the formula as a basis for the null model.
  attr(f0, ".Environment") <- environment() # Since we removed the .Environment attribute for size reduction, add it back so that the following steps work.
  lazyeval::f_rhs(f0) <- 1 # Create the null model formula.
  x0 <- glm(f0, x$model, family = x$family) # Build the null model. Use x$model rather than x$data, since x$model is the data after glm handled missingness.
  # The p-value comes from a chi-square test on the deviance drop from the null model to the fitted model.
  pvalue <- with(anova(x0, x), pchisq(Deviance, Df, lower.tail = FALSE)[2])
  if(pretty.name) {
    ret <- ret %>% dplyr::mutate(`P Value`=!!pvalue, `Rows`=!!length(x$y))
  }
  else {
    ret <- ret %>% dplyr::mutate(p.value=!!pvalue, n=!!length(x$y))
  }

  # For GLM (Negative Binomial)
  if("negbin" %in% class(x)) {
    if(pretty.name) {
      ret <- ret %>% dplyr::mutate(`Theta`=!!(x$theta), `SE Theta`=!!(x$SE.theta))
    }
    else {
      ret <- ret %>% dplyr::mutate(theta=!!(x$theta), SE.theta=!!(x$SE.theta))
    }
  }
  
  if (x$family$family %in% c('gaussian')) { # Only for gaussian, which is the same as linear regression (if the link function is identity).
    root_mean_square_error <- rmse(x$fitted.values, x$y)
    rsq <- r_squared(x$y, x$fitted.values)
    n_observations <- nrow(x$model)
    df_residual <- x$df.residual
    adj_rsq <- adjusted_r_squared(rsq, n_observations, df_residual)
    ret$rmse <- root_mean_square_error
    ret$r.squared <- rsq
    ret$adj.r.squared <- adj_rsq
  }
  if (x$family$family %in% c('binomial', 'quasibinomial')) { # only for logistic regression.
    # Calculate F Score, Accuracy Rate, Misclassification Rate, Precision, Recall, Number of Rows 
    threshold_value <- if (is.numeric(binary_classification_threshold)) {
      binary_classification_threshold
    } else {
      get_optimized_score(x$y, x$fitted.values, threshold = binary_classification_threshold)$threshold
    }
    if (is.null(threshold_value)) { # Could not get an optimized value. To avoid an error, fall back to a default.
      threshold_value <- 0.5
    }
    predicted <- ifelse(x$fitted.values > threshold_value, 1, 0)
    ret2 <- evaluate_classification(x$y, predicted, 1, pretty.name = pretty.name)
    ret2 <- ret2[, 2:6]
    ret <- ret %>% bind_cols(ret2)

    # Calculate AUC.
    ret$auc <- auroc(x$fitted.values, x$y)
    # Show the number of rows for the positive and negative cases, especially so that the result of SMOTE is visible.
    ret$positives <- sum(x$y == 1, na.rm = TRUE)
    ret$negatives <- sum(x$y != 1, na.rm = TRUE)
  }

  # Add Max VIF if VIF is available.
  if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
    vif_df <- vif_to_dataframe(x)
    if (nrow(vif_df) > 0 ) {
      max_vif <- max(vif_df$VIF, na.rm=TRUE)
      ret <- ret %>% dplyr::mutate(`Max VIF`=!!max_vif)
    }
  }

  if(pretty.name) {
    if (x$family$family %in% c('binomial', 'quasibinomial')) { # for binomial regressions.
      colnames(ret)[colnames(ret) == "null.deviance"] <- "Null Deviance"
      colnames(ret)[colnames(ret) == "df.null"] <- "Null Model DF"
      colnames(ret)[colnames(ret) == "logLik"] <- "Log Likelihood"
      colnames(ret)[colnames(ret) == "deviance"] <- "Residual Deviance"
      colnames(ret)[colnames(ret) == "df.residual"] <- "Residual DF"
      colnames(ret)[colnames(ret) == "auc"] <- "AUC"
      
      ret <- ret %>% dplyr::select(AUC, `F1 Score`, `Accuracy Rate`, `Misclass. Rate`, `Precision`, `Recall`, `P Value`, `Rows`, positives, negatives,  `Log Likelihood`, `AIC`, `BIC`, `Residual Deviance`, `Residual DF`, `Null Deviance`, `Null Model DF`, everything())

      if (!is.null(x$orig_levels)) { 
        pos_label <- x$orig_levels[2]
        neg_label <- x$orig_levels[1]
      }
      else {
        # This should only be the numeric case.
        # For 0 and 1 this makes sense, but it seems the input can also be
        # numbers between 0 and 1, like 0.5.
        # TODO: Look into how to handle such a case.
        pos_label <- "TRUE"
        neg_label <- "FALSE"
      }
      colnames(ret)[colnames(ret) == "positives"] <- paste0("Rows for ", pos_label)
      colnames(ret)[colnames(ret) == "negatives"] <- paste0("Rows for ", neg_label)
    }
    else { # for other numeric regressions.
      colnames(ret)[colnames(ret) == "null.deviance"] <- "Null Deviance"
      colnames(ret)[colnames(ret) == "df.null"] <- "Null Model DF"
      colnames(ret)[colnames(ret) == "logLik"] <- "Log Likelihood"
      colnames(ret)[colnames(ret) == "deviance"] <- "Residual Deviance"
      colnames(ret)[colnames(ret) == "df.residual"] <- "Residual DF"
      colnames(ret)[colnames(ret) == "rmse"] <- "RMSE"
      colnames(ret)[colnames(ret) == "r.squared"] <- "R Squared"
      colnames(ret)[colnames(ret) == "adj.r.squared"] <- "Adj R Squared"

      ret <- ret %>% dplyr::select(matches("^R Squared$"), matches("^Adj R Squared$"), matches("^RMSE$"), `P Value`, `Rows`, `Log Likelihood`, `AIC`, `BIC`, `Residual Deviance`, `Residual DF`, `Null Deviance`, `Null Model DF`, everything())
    }
  }
  if (!is.null(ret$nobs)) { # glance.glm's newly added nobs seems to be the same as Number of Rows. Suppressing it for now.
    ret <- ret %>% dplyr::select(-nobs)
  }

  ret
}

# Creates a data frame that maps term name to its base level.
xlevels_to_base_level_table <- function(xlevels) {
  term <- purrr::flatten_chr(purrr::map(names(xlevels), function(vname) {
    # Quote variable name with backtick if it includes special characters or space.
    # Special characters to detect besides space. Note that period and underscore should *not* be included here. : ~!@#$%^&*()+={}|:;'<>,/?"[]-\
    # Using grepl() as opposed to str_detect() because str_detect seems to return a wrong result when vname ends with an SJIS dame-moji.
    # perl=TRUE is required here, since it seems this regex does not detect space, tilde, etc. without perl=TRUE for some reason.
    paste0(if_else(grepl("[ ~!@#$%^&*()+={}|:;'<>,/?\"\\[\\]\\-\\\\]", vname, perl=TRUE), paste0('`',vname,'`'),vname),xlevels[[vname]])
  }))
  base_level <- purrr::flatten_chr(purrr::map(xlevels, function(v){rep(v[[1]],length(v))}))
  ret <- data.frame(term=term, base.level=base_level)
  ret
}
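
# For illustration, with a hypothetical xlevels list:
#   xlevels_to_base_level_table(list(cyl = c("4", "6", "8"), `my var` = c("a", "b")))
# returns:
#   term       base.level
#   cyl4       4
#   cyl6       4
#   cyl8       4
#   `my var`a  a
#   `my var`b  a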

# Takes an lm/glm model with vif (variance inflation factor) and returns a data frame with the extracted info.
vif_to_dataframe <- function(x) {
  ret <- NULL
  if (is.matrix(x$vif)) {
    # It is a matrix when there are categorical variables.
    ret <- x$vif %>% as.data.frame() %>% tibble::rownames_to_column(var="term") %>% rename(VIF=GVIF)
  }
  else {
    # It is a vector when there is no categorical variable.
    ret <- data.frame(term=names(x$vif), VIF=x$vif)
  }
  # Eliminate negative VIF values that sometimes happen, most likely because of numerical error.
  ret <- ret %>% dplyr::mutate(VIF = pmax(VIF, 0))
  # Map variable names back to the original.
  # as.character converts from factor to be safe; with a factor, the reverse mapping result would be wrong.
  ret$term <- x$terms_mapping[as.character(ret$term)]
  ret
}
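
# For illustration: with only numeric predictors, x$vif is a named vector keyed by the
# cleaned term names (c1_, c2_, ...), e.g. c(c1_ = 1.2, c2_ = 3.4). Assuming x$terms_mapping
# maps them back to hypothetical original names "age" and "income", the result is:
#   term    VIF
#   age     1.2
#   income  3.4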

# From the name of a variable, returns the possible names of terms returned from lm/glm.
var_to_possible_terms_lm <- function(var, x) {
  if (is.factor(x$model[[var]])) {
    # Possibly, the variable name in the term name is quoted with a backtick.
    c(paste0(var, levels(x$model[[var]])),
      paste0('`', var, '`', levels(x$model[[var]])))
  }
  else {
    # Possibly, the term name is quoted with a backtick.
    c(var, paste0('`', var, '`'))
  }
}
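
# For illustration: if x$model$cyl is a factor with levels "4", "6" and "8",
#   var_to_possible_terms_lm("cyl", x)
# returns c("cyl4", "cyl6", "cyl8", "`cyl`4", "`cyl`6", "`cyl`8").
# For a numeric variable "wt", it returns c("wt", "`wt`").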

# From the name of a variable, returns the possible names of terms returned from coxph.
var_to_possible_terms_coxph <- function(var, x) {
  if (!is.null(x$xlevels[[var]])) {
    # Possibly, the variable name in the term name is quoted with a backtick.
    paste0(var, x$xlevels[[var]])
  }
  else {
    # Possibly, the term name is quoted with a backtick.
    var
  }
}

# Returns the p-value for the variable. For a categorical variable, the smallest p-value among its levels is returned.
# Used for the color of the relative importance bar chart.
get_var_min_pvalue <- function(var, coef_df, x) {
  if ("coxph" %in% class(x)) {
    terms <- var_to_possible_terms_coxph(as.character(var), x)
  }
  else { # We assume it is lm or glm here.
    terms <- var_to_possible_terms_lm(as.character(var), x)
  }
  # na.rm is necessary to handle the case where some levels of a categorical variable are dropped due to perfect collinearity.
  min(coef_df$p.value[coef_df$term %in% terms], na.rm=TRUE)
}
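
# For illustration: if coef_df has p-value 0.03 for term "cyl6" and 0.20 for term "cyl8",
# get_var_min_pvalue("cyl", coef_df, x) returns 0.03 for the factor variable "cyl".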

#' special version of tidy.lm function to use with build_lm.fast.
#' @export
tidy.lm_exploratory <- function(x, type = "coefficients", pretty.name = FALSE, ...) { #TODO: add test
  if ("error" %in% class(x)) {
    ret <- data.frame()
    return(ret)
  }
  switch(type,
    coefficients = {
      # Since broom:::tidy.lm raises "Error: No tidy method for objects of class lm_exploratory",
      # always use broom:::tidy.glm, which does not have this problem and seems to return the same result,
      # even for lm.
      ret <- broom:::tidy.glm(x)
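      # estimate +/- 1.96 * std.error gives a normal-approximation 95% confidence interval.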
      ret <- ret %>% mutate(conf.low=estimate-1.96*std.error, conf.high=estimate+1.96*std.error)
      base_level_table <- xlevels_to_base_level_table(x$xlevels)
      # Convert term from factor to character to remove warning at left_join.
      ret <- ret %>% dplyr::mutate(term=as.character(term)) %>% dplyr::left_join(base_level_table, by="term")
      if (any(is.na(x$coefficients))) {
        # Since broom skips coefficients whose value is NA, which means they were removed by lm because of multicollinearity,
        # put them back to show them.
        # Reference: https://stats.stackexchange.com/questions/25804/why-would-r-return-na-as-a-lm-coefficient
        removed_coef_df <- data.frame(term=names(x$coefficients[is.na(x$coefficients)]), note="Dropped most likely due to perfect multicollinearity.")
        ret <- ret %>% dplyr::bind_rows(removed_coef_df)
        if (pretty.name) {
          ret <- ret %>% rename(Note=note)
        }
      }
      # Map coefficient names back to original.
      ret$term <- map_terms_to_orig(ret$term, x$terms_mapping)
      if (pretty.name) {
        ret <- ret %>% rename(Term=term, Coefficient=estimate, `Std Error`=std.error,
                              `t Value`=statistic, `P Value`=p.value,
                              `Conf Low`=conf.low,
                              `Conf High`=conf.high,
                              `Base Level`=base.level)
      }
      ret
    },
    vif = {
      if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
        ret <- vif_to_dataframe(x)
      }
      else {
        ret <- data.frame() # Skip output for this group. TODO: Report error in some way.
      }
      ret
    },
    partial_dependence = {
      handle_partial_dependence(x)
    },
    importance = {
      if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
        # Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
        # Return empty data.frame to avoid error.
        ret <- data.frame()
        return(ret)
      }
      ret <- x$imp_df
      # Add p.value column.
      # Since broom:::tidy.lm raises "Error: No tidy method for objects of class lm_exploratory",
      # always use broom:::tidy.glm, which does not have this problem and seems to return the same result,
      # even for lm.
      coef_df <- broom:::tidy.glm(x)
      ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
        get_var_min_pvalue(var, coef_df, x)
      })) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
      # Map variable names back to the original.
      # as.character converts from factor to be safe; with a factor, the reverse mapping result would be wrong.
      ret$variable <- x$terms_mapping[as.character(ret$variable)]
      ret
    },
    # This is a copy of the code for "importance" with the output column name adjusted, kept just for backward compatibility with pre-6.5.
    # Remove at an appropriate time.
    permutation_importance = {
      if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
        # Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
        # Return empty data.frame to avoid error.
        ret <- data.frame()
        return(ret)
      }
      ret <- x$imp_df
      # Add p.value column.
      # Since broom:::tidy.lm raises "Error: No tidy method for objects of class lm_exploratory",
      # always use broom:::tidy.glm, which does not have this problem and seems to return the same result,
      # even for lm.
      coef_df <- broom:::tidy.glm(x)
      ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
        get_var_min_pvalue(var, coef_df, x)
      })) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
      # Map variable names back to the original.
      # as.character converts from factor to be safe; with a factor, the reverse mapping result would be wrong.
      ret$variable <- x$terms_mapping[as.character(ret$variable)]
      ret <- ret %>% dplyr::rename(term=variable)
      ret
    }
  )
}
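
# Illustrative usage (a sketch; assumes `model_df` is the data frame returned by build_lm.fast):
#   model_df %>% tidy_rowwise(model, type = "coefficients", pretty.name = TRUE)
#   model_df %>% tidy_rowwise(model, type = "vif")
#   model_df %>% tidy_rowwise(model, type = "importance")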

#' Special version of tidy.glm function to use with build_lm.fast.
#' In case of error, returns an empty data frame, or a data frame with a Note column.
#' @export
tidy.glm_exploratory <- function(x, type = "coefficients", pretty.name = FALSE, variable_metric = NULL, converged_only = FALSE, ...) { #TODO: add test
  if ("error" %in% class(x)) {
    ret <- data.frame()
    return(ret)
  }
  # Skip if the model did not converge. We use this to skip the coefficient scatter plot when it has very large coefficients,
  # or, when odds ratio is used, Inf or 0 odds ratios.
  if (converged_only && !x$converged) {
    ret <- data.frame()
    return(ret)
  }
  switch(type,
    coefficients = {
      ret <- broom:::tidy.glm(x)
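      # estimate +/- 1.96 * std.error gives a normal-approximation 95% confidence interval.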
      ret <- ret %>% mutate(conf.low=estimate-1.96*std.error, conf.high=estimate+1.96*std.error)
      if (x$family$family == "binomial" && x$family$link == "logit") { # odds ratio is only for logistic regression
        ret <- ret %>% mutate(odds_ratio=exp(estimate))
        if (!is.null(variable_metric) && variable_metric == "odds_ratio") { # For Analytics View, overwrite conf.low/conf.high with those of odds ratio.
          ret <- ret %>% mutate(conf.low=exp(conf.low), conf.high=exp(conf.high))
          ret <- ret %>% select(term, odds_ratio, conf.low, conf.high, everything()) # Bring odds ratio upfront together with its confidence interval.
        }
        else {
          ret <- ret %>% select(term, estimate, conf.low, conf.high, everything()) # Bring coefficient upfront together with its confidence interval.
        }
      }
      if (!is.null(x$marginal_effects)) {
        # Convert term from factor to character to remove warning at left_join.
        ret <- ret %>% dplyr::mutate(term=as.character(term)) %>% dplyr::left_join(x$marginal_effects, by="term")
        # Adjust order of columns
        if (!is.null(ret$ame)) {
          ret <- ret %>% relocate(ame, .after=term)
          if (!is.null(ret$ame_low)) {
            ret <- ret %>% relocate(ame_high, ame_low, .after=ame)
          }
        }
      }
      base_level_table <- xlevels_to_base_level_table(x$xlevels)
      # Convert term from factor to character to remove warning at left_join.
      ret <- ret %>% dplyr::mutate(term=as.character(term)) %>% dplyr::left_join(base_level_table, by="term")
      if (any(is.na(x$coefficients))) {
        # Since broom skips coefficients whose value is NA, which means they were removed by lm because of multicollinearity,
        # put them back to show them.
        # Reference: https://stats.stackexchange.com/questions/25804/why-would-r-return-na-as-a-lm-coefficient
        removed_coef_df <- data.frame(term=names(x$coefficients[is.na(x$coefficients)]), note="Dropped most likely due to perfect multicollinearity.")
        ret <- ret %>% dplyr::bind_rows(removed_coef_df)
        if (pretty.name) {
          ret <- ret %>% rename(Note=note)
        }
      }
      # Map coefficient names back to original.
      ret$term <- map_terms_to_orig(ret$term, x$terms_mapping)
      if (pretty.name) {
        # Rename to pretty names
        ret <- ret %>% rename(Term=term, Coefficient=estimate, `Std Error`=std.error,
                              `t Value`=statistic, `P Value`=p.value, `Conf Low`=conf.low, `Conf High`=conf.high,
                              `Base Level`=base.level)
        if (!is.null(ret$ame)) {
          ret <- ret %>% rename(`Average Marginal Effect`=ame)
        }
        if (!is.null(ret$ame_low)) {
          ret <- ret %>% rename(`AME Low`=ame_low,`AME High`=ame_high)
        }
        if (x$family$family == "binomial" && x$family$link == "logit") { # odds ratio is only for logistic regression
          ret <- ret %>% rename(`Odds Ratio`=odds_ratio)
        }
      }
      ret
    },
    conf_mat = {
      target_col <- as.character(lazyeval::f_lhs(x$formula)) # Get the target column name.
      actual_val <- x$model[[target_col]]

      predicted <- x$fitted.values > 0.5 # TODO: Make the threshold adjustable. Note: This part of the code seems to be unused. Check and remove.
      # Convert predicted to the original set of values; it should be either logical, numeric, or factor.
      predicted <- if (is.logical(actual_val)) {
        predicted
      } else if (is.numeric(actual_val)) {
        as.numeric(predicted)
      } else if (is.factor(actual_val)){
        # Create a factor vector with the same levels as actual_val.
        # predicted is logical, so add 1 to turn it into an index.
        factor(levels(actual_val)[as.numeric(predicted) + 1], levels(actual_val))
      }

      ret <- data.frame(
        actual_value = actual_val,
        predicted_value = predicted
      ) %>%
        dplyr::filter(!is.na(predicted_value))

      # Since this is classification, get the count for each actual/predicted value combination.
      ret <- ret %>%
        dplyr::group_by(actual_value, predicted_value) %>%
        dplyr::summarize(count = n()) %>%
        dplyr::ungroup()
      ret
    },
    vif = {
      if (!is.null(x$vif) && "error" %nin% class(x$vif)) {
        ret <- vif_to_dataframe(x)
      }
      else {
        ret <- data.frame() # Skip output for this group. TODO: Report error in some way.
      }
      ret
    },
    partial_dependence = {
      handle_partial_dependence(x)
    },
    importance = {
      if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
        # Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
        # Return empty data.frame to avoid error.
        ret <- data.frame()
        return(ret)
      }
      ret <- x$imp_df
      # Add p.value column.
      coef_df <- broom:::tidy.glm(x)
      ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
        get_var_min_pvalue(var, coef_df, x)
      })) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
      # Map variable names back to the original.
      # as.character converts from factor to be safe; with a factor, the reverse mapping result would be wrong.
      ret$variable <- x$terms_mapping[as.character(ret$variable)]
      ret
    },
    # This is a copy of the code for "importance" with the output column name adjusted, kept just for backward compatibility with pre-6.5.
    # Remove at an appropriate time.
    permutation_importance = {
      if (is.null(x$imp_df) || "error" %in% class(x$imp_df)) {
        # Permutation importance is not supported for the family and link function, or skipped because there is only one variable.
        # Return empty data.frame to avoid error.
        ret <- data.frame()
        return(ret)
      }
      ret <- x$imp_df
      # Add p.value column.
      coef_df <- broom:::tidy.glm(x)
      ret <- ret %>% dplyr::mutate(p.value=purrr::map(variable, function(var) {
        get_var_min_pvalue(var, coef_df, x)
      })) %>% dplyr::mutate(p.value=as.numeric(p.value)) # Make list into numeric vector.
      # Map variable names back to the original.
      # as.character converts from factor to be safe; with a factor, the reverse mapping result would be wrong.
      ret$variable <- x$terms_mapping[as.character(ret$variable)]
      ret <- ret %>% dplyr::rename(term=variable)
      ret
    }
  )
}
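
# Illustrative usage (a sketch; assumes `model_df` holds a logistic regression from build_lm.fast):
#   model_df %>% tidy_rowwise(model, type = "coefficients", variable_metric = "odds_ratio")
# brings odds_ratio and its confidence interval to the front of the coefficient table.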

#' wrapper for tidy type partial dependence
#' @export
lm_partial_dependence <- function(df, ...) { # TODO: write test for this.
  res <- df %>% tidy_rowwise(model, type="partial_dependence", ...)
  if (nrow(res) == 0) {
    return(data.frame()) # Skip the rest of processing by returning empty data.frame.
  }
  grouped_col <- grouped_by(res) # When called from analytics view, this should be a single column or empty.
                                 # grouped_by has to be on res rather than on df since dplyr::group_vars
                                 # does not work on rowwise-grouped data frame.

  if (length(grouped_col) > 0) {
    res <- res %>% dplyr::ungroup() # ungroup to mutate group_by column.
    # The following is not necessary since v5.5, where we started displaying a separate partial dependence plot for each group.
    # add variable name to the group_by column, so that chart is repeated by the combination of group_by column and variable name.
    # res[[grouped_col]] <- paste(as.character(res[[grouped_col]]), res$x_name)

    res[[grouped_col]] <- forcats::fct_inorder(factor(res[[grouped_col]])) # set order to appear as facets
    res <- res %>% dplyr::group_by(!!!rlang::syms(grouped_col)) # put back group_by for consistency
  }
  else {
    res$x_name <- forcats::fct_inorder(factor(res$x_name)) # set order to appear as facets
  }
  # The gather() we did after the edarf::partial_dependence call turns x_value into a factor when the variables are not all of the same data type (e.g. numeric).
  # To keep the numeric or factor order (e.g. Sun, Mon, Tue) of x_value in the resulting chart, we apply fct_inorder here while x_value is still in order.
  # The inner factor() is for the case where x_value is not already a factor, to avoid an error from fct_inorder().
  res <- res %>% dplyr::mutate(x_value = forcats::fct_inorder(factor(x_value))) # TODO: If the same number appears for different variables, the order will be broken.
  res
}
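
# Illustrative usage (a sketch; assumes `model_df` is the data frame returned by build_lm.fast):
#   model_df %>% lm_partial_dependence()
# returns a long-format data frame with x_name/x_value columns, with factor orders set
# so that charts can be faceted per variable (or per group).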

#' @export
augment.lm_exploratory <- function(x, data = NULL, newdata = NULL, data_type = "training", ...) {
  if ("error" %in% class(x)) {
    ret <- data.frame()
    return(ret)
  }

  if(!is.null(newdata)) {
    # Replay the mutations on predictors.
    if(!is.null(x$predictor_funs)) {
      newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
    }

    predictor_variables <- all.vars(x$terms)[-1]
    predictor_variables_orig <- x$terms_mapping[predictor_variables]

    # Rename columns via predictor_variables_orig, which is a named vector.
    # TODO: What if the names of the other columns conflict with our temporary names, c1_, c2_, ...?
    cleaned_data <- newdata %>% dplyr::rename(predictor_variables_orig)

    # Align factor levels including Others and (Missing) to the model. TODO: factor level order can be different from the model training data. Is this ok?
    cleaned_data <- align_predictor_factor_levels(cleaned_data, x$xlevels, predictor_variables)

    na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
    if (length(na_row_numbers) > 0) {
      cleaned_data <- cleaned_data[-na_row_numbers,]
    }
    ret <- broom:::augment.lm(x, data = NULL, newdata = cleaned_data, se = TRUE, ...)
    # TODO: Restore removed rows.
  } else if (!is.null(data)) {
    data <- data %>% dplyr::relocate(!!rlang::sym(x$target_col), .after = last_col()) # Bring the target column to the last so that it is next to the predicted value in the output.
    ret <- switch(data_type,
      training = { # Call broom:::augment.lm as is
        broom:::augment.lm(x, data = data, newdata = newdata, se = TRUE, ...)
      },
      test = {
        # Augment data with already predicted result in the model.
        data$.fitted <- restore_na(x$prediction_test$fit, x$prediction_test$unknown_category_rows_index)
        data$.se.fit <- restore_na(x$prediction_test$se.fit, x$prediction_test$unknown_category_rows_index)
        data
      })
  }
  else {
    ret <- broom:::augment.lm(x, se = TRUE, ...)
  }
  # Rename columns back to the original names.
  names(ret) <- coalesce(x$terms_mapping[names(ret)], names(ret))
  ret
}
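
# Illustrative usage (a sketch; `new_df` is hypothetical data with the original predictor columns):
#   augment(model_df$model[[1]], newdata = new_df)
# returns new_df with .fitted and .se.fit columns added, and column names mapped back to the originals.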

#' @export
augment.glm_exploratory <- function(x, data = NULL, newdata = NULL, data_type = "training", binary_classification_threshold = 0.5, ...) {
  if ("error" %in% class(x)) {
    ret <- data.frame()
    return(ret)
  }
  if(!is.null(newdata)) {
    # Replay the mutations on predictors.
    if(!is.null(x$predictor_funs)) {
      newdata <- newdata %>% mutate_predictors(x$orig_predictor_cols, x$predictor_funs)
    }

    predictor_variables <- all.vars(x$terms)[-1]
    predictor_variables_orig <- x$terms_mapping[predictor_variables]

    # Rename columns via predictor_variables_orig, which is a named vector.
    # TODO: What if the names of the other columns conflict with our temporary names, c1_, c2_, ...?
    cleaned_data <- newdata %>% dplyr::rename(predictor_variables_orig)

    # Align factor levels including Others and (Missing) to the model. TODO: factor level order can be different from the model training data. Is this ok?
    cleaned_data <- align_predictor_factor_levels(cleaned_data, x$xlevels, predictor_variables)

    na_row_numbers <- ranger.find_na(predictor_variables, cleaned_data)
    if (length(na_row_numbers) > 0) {
      cleaned_data <- cleaned_data[-na_row_numbers,]
    }

    ret <- tryCatch({
      broom:::augment.glm(x, data = NULL, newdata = cleaned_data, se = TRUE, ...)
    }, error = function(e){
      # se=TRUE throws an error that looks like "singular matrix 'a' in solve"
      # in some cases of perfect collinearity.
      # Try to recover from it by rerunning with se=FALSE.
      broom:::augment.glm(x, data = NULL, newdata = cleaned_data, se = FALSE, ...)
    })
  } else if (!is.null(data)) {
    # Bring the target column to the last so that it is next to the predicted value in the output.
    # Note that source.data of lm/glm model df has mapped column names, unlike that of ranger.
    data <- data %>% dplyr::relocate(!!rlang::sym(x$target_col), .after = last_col())
    ret <- switch(data_type,
      training = {
        tryCatch({
          broom:::augment.glm(x, data = data, newdata = newdata, se = TRUE, ...)
        }, error = function(e){
          broom:::augment.glm(x, data = data, newdata = newdata, se = FALSE, ...)
        })
      },
      test = {
        # Augment data with already predicted result in the model.
        data$.fitted <- restore_na(x$prediction_test$fit, x$prediction_test$unknown_category_rows_index)
        data$.se.fit <- restore_na(x$prediction_test$se.fit, x$prediction_test$unknown_category_rows_index)
        data
      })
  }
  else {
    ret <- tryCatch({
      broom:::augment.glm(x, se = TRUE, ...)
    }, error = function(e){
      broom:::augment.glm(x, se = FALSE, ...)
    })
  }

  ret <- add_response(ret, x, "predicted_response") # Add response.
  if (!is.null(ret$.fitted)) {
    # Bring the predicted_response column to the front of the prediction-related columns, so that it comes right after the actual values.
    ret <- ret %>% dplyr::relocate(any_of(c("predicted_response")), .before=.fitted)
  }

  if (x$family$family == "binomial") { # Add predicted label in case of binomial (including logistic regression).
    ret$predicted_label <- ret$predicted_response > binary_classification_threshold
  }
  # Rename columns back to the original names.
  names(ret) <- coalesce(x$terms_mapping[names(ret)], names(ret))
  ret
}
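
# Illustrative usage (a sketch; assumes a logistic regression model and its saved source data):
#   augment(model_df$model[[1]], data = model_df$source.data[[1]], data_type = "training")
# adds predicted_response (the predicted probability) and predicted_label
# (predicted_response > binary_classification_threshold) columns.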

# For some reason, find_data called from inside margins::marginal_effects() fails in Exploratory.
# Explicitly declaring find_data for our glm_exploratory class works around it.
#' @export
find_data.glm_exploratory <- function(model, env = parent.frame(), ...) {
  model$data
}

# Generates the Summary table for the Analytics View. It can handle Test Mode.
# This is written for the linear regression analytics view and the GLM analytics views that make numeric predictions.
evaluate_lm_training_and_test <- function(df, pretty.name = FALSE){
  # Get the summary row for training data. Info is retrieved from the model by glance().
  training_ret <- df %>% glance_rowwise(model, pretty.name = pretty.name)
  ret <- training_ret

  grouped_col <- colnames(df)[!colnames(df) %in% c("model", ".test_index", "source.data")]

  # Consider it test mode if any element of the .test_index column has non-zero length, and generate summary rows for test data.
  # Unlike for training data, this involves calculating the metrics ourselves from the test prediction result.
  if (purrr::some(df$.test_index, function(x){length(x)!=0})) {
    ret$is_test_data <- FALSE # Set is_test_data FALSE for training data. Add is_test_data column only when there are test data too.
    each_func <- function(df){
      # With the way this is called, df becomes a list rather than a data.frame.
      # Make it a data.frame again so that prediction() can be applied to it.
      if (!is.data.frame(df)) {
        df <- tibble::tribble(~model, ~.test_index, ~source.data,
                              df$model, df$.test_index, df$source.data)
      }

      tryCatch({
        test_pred_ret <- prediction(df, data = "test")
        ## get Model Object
        m <- (df %>% filter(!is.null(model)))$model[[1]]
        actual_val_col <- all.vars(df$model[[1]]$terms)[[1]]
        # Get original target column name.
        actual_val_col_orig <- df$model[[1]]$terms_mapping[[actual_val_col]]

        actual <- test_pred_ret[[actual_val_col_orig]]
        predicted <- test_pred_ret$predicted_value
        root_mean_square_error <- rmse(actual, predicted)
        test_n <- sum(!is.na(predicted)) # Sample size for test.

        # To calculate R Squared for the test data, use the same null model basis as for training,
        # so that the results are comparable.
        null_model_mean <- mean(df$model[[1]]$model[[actual_val_col]], na.rm=TRUE)

        rsq <- r_squared(actual, predicted, null_model_mean)

        # Calculate Adjusted R Squared.
        # https://en.wikipedia.org/wiki/Coefficient_of_determination
        n_observations <- nrow(df$model[[1]]$model)
        df_residual <- df$model[[1]]$df.residual
        adj_rsq <- adjusted_r_squared(rsq, n_observations, df_residual)

        test_ret <- data.frame(
                          r.squared = rsq,
                          adj.r.squared = adj_rsq,
                          rmse = root_mean_square_error,
                          n = test_n
                          )
        if(pretty.name) {
          test_ret <- test_ret %>% dplyr::rename(`R Squared`=r.squared, `Adj R Squared`=adj.r.squared, `RMSE`=rmse, `Rows`=n)
        }
        test_ret$is_test_data <- TRUE
        test_ret
      }, error = function(e){
        data.frame()
      })
    }

    # df is already grouped rowwise, but to get group column value on the output, we need to group it explicitly with the group column.
    if (length(grouped_col) > 0) {
      df <- df %>% dplyr::group_by(!!!rlang::syms(grouped_col))
    }

    test_ret <- do_on_each_group(df, each_func, with_unnest = TRUE)
    ret <- ret %>% dplyr::bind_rows(test_ret)
  }

  # Reorder columns. Bring group_by column first, and then is_test_data column, if it exists.
  if (!is.null(ret$is_test_data)) {
    if (length(grouped_col) > 0) {
      ret <- ret %>% dplyr::select(!!!rlang::syms(grouped_col), is_test_data, everything())
    }
    else {
      ret <- ret %>% dplyr::select(is_test_data, everything())
    }
  }
  else {
    if (length(grouped_col) > 0) {
      ret <- ret %>% dplyr::select(!!!rlang::syms(grouped_col), everything())
    }
  }

  if (length(grouped_col) > 0){
    ret <- ret %>% dplyr::arrange(!!!rlang::syms(grouped_col))
  }

  # Prettify is_test_data column. Do this after the above select calls, since it looks at is_test_data column.
  if (!is.null(ret$is_test_data) && pretty.name) {
    ret <- ret %>% dplyr::select(is_test_data, everything()) %>%
      dplyr::mutate(is_test_data = dplyr::if_else(is_test_data, "Test", "Training")) %>%
      dplyr::rename(`Data Type` = is_test_data)
  }

  # Bring the Note column to the end.
  if (!is.null(ret$Note)) {
    ret <- ret %>% dplyr::select(-Note, everything(), Note)
  }

  ret
}
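
# Illustrative usage (a sketch; assumes `model_df` is the data frame returned by build_lm.fast):
#   model_df %>% evaluate_lm_training_and_test(pretty.name = TRUE)
# returns one summary row per group for training data and, when test rows were held out,
# one more row per group for test data, distinguished by the Data Type column.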