rNuggets: Useful Helper Functions for (Academic) Data Analysis

Documented in .is.numeric_col run_lm summary.rN_lm tunnel

#' Tunnel a dataframe through a function for side effects
#'
#' Within a dplyr-pipe, you might want to print a summary statistic or run some
#' other function before continuing to process or save the original data. This
#' function does that by taking a dataframe and function as arguments, printing
#' the result of the function and returning the dataframe for further processing.
#'
#' @param df A dataframe
#' @param fun A function that is applied to the dataframe and the
#' output of which is printed. If the argument is not passed, df is
#' merely printed and not returned, which only makes sense in combination
#' with the `note` argument
#' @param ... Arguments passed on to \code{fun}
#' @param note A string that is printed above the \code{fun}-output
#' @param return Logical - should df be returned?
#' @return The original dataframe, unless \code{return} is set to FALSE. In that
#'   case, NULL is returned invisibly, so that the function can be used to just
#'   add a note above the output of another function.
#' @source Inspired by the magrittr \code{\%T>\%} operator that promises similar
#'   functionality but often didn't quite fit my needs
#' @export
#' @examples
#' library(magrittr)
#' x <- 1:5 %>%
#'   tunnel(mean, note = "Mean") %>%
#'   tunnel(sd, note = "SD")
#' x
tunnel <- function(df, fun, ..., note = NULL, return = TRUE) {
  if (missing(fun)) {
    fun <- function(x) x
    return <- FALSE
  }

  if (!is.null(note)) print(note)
  print(fun(df, ...))
  if (return) {
    return(df)
  } else {
    invisible(NULL)
  }
}

#' lm() for pipes - data as first argument
#'
#' Within a dplyr-pipe, running lm() is often complicated be the placing of the
#' data argument. This wrapper places data first and allows to run standardized
#' models.
#'
#' Note that the model call in the lm-object is replaced by the call to this
#' function - that means that \code{update()} cannot be used.
#'
#' @param df Data for modeling
#' @param std Logical. Should variables be standardised? This is only applied to
#' numeric variables, factors are left unchanged so that their coefficients
#' remain interpretable.
#' @param rename_std Logical. Should standardised variables be indicated by _sd
#' suffix
#' @inheritParams stats::lm
#' @inheritDotParams stats::lm -data
#' @source After experiencing an issue with passing weights, I rewrote this
#' based on the code suggested by "Vandenman" here
#' https://stackoverflow.com/questions/38683076/ellipsis-trouble-passing-to-lm
#' @references See (Fox, 2015) for an argument why dummy variables should never
#' be standardised. If you want to run a model with all variables standardised,
#' one option is `QuantPsyc::lm.beta()`
#' @export

run_lm <- function(df, formula, std = FALSE, rename_std = FALSE, ...) {
  if (std) {
    vars <- all.vars(formula)
    vars_num <- vars[purrr::map_lgl(vars, .is.numeric_col, df = df)]

    if (rename_std) {
      df <- df %>% dplyr::mutate_at(vars_num, list(sd = scale_blank))


      repl <- paste0(vars_num, "_sd")
      names(repl) <- vars_num
      formula <- Reduce(paste, deparse(formula)) %>%
        stringr::str_replace_all(c(repl)) %>%
        as.formula()
    } else {
      df <- df %>% dplyr::mutate_at(vars_num, list(scale_blank))
    }
  }

  # get names of stuff in ...
  arg_names <- sapply(substitute(list(...))[-1L], deparse)
  # look for identical names in df
  m <- match(names(df), arg_names, 0L)

  # store other arguments from ... in a list, if any
  dot_args <- eval(parse(text = arg_names[-m]))
  if (is.null(dot_args)) {
    args <- list()
  } else {
    args <- list(dot_args)
    # name the list
    names(args) <- names(arg_names[-m])
  }

  # store complete values in args, instead of just references to columns
  # the unlist code is rather ugly, the goal is to create a list where every
  # element is a column of interest
  args[names(arg_names)[m]] <- unlist(apply(
    df[, as.logical(m), drop = FALSE],
    2, list
  ), recursive = FALSE)
  # also put other stuff in there
  args$formula <- formula
  args$data <- df
  # do lm
  mod <- do.call(lm, args)
  class(mod) <- c(class(mod), "rN_lm")
  if (std) {
    mod$call_fmt <- c(sys.call(), "Note: DV and continuous IVs were standardised")
    class(mod) <- c(class(mod), "rN_std")
  } else {
    mod$call_fmt <- c(sys.call())
  }
  mod
}

#' Summary of an lm object created with run_lm() wrapper
#'
#' Using \code{\link{run_lm}} creates a very unwieldy call to lm(). This
#' function replaces it by a more legible call in the `summary()`-output
#'
#' @param object A model with class `rN_lm`
#' @param ... Parameters passed down to summary and print
#' @inheritDotParams stats::summary.lm
#' @export

summary.rN_lm <- function(object, ...) {
  out <- stats::summary.lm(object, ...)
  out$call <- object$call_fmt
  out
}

#' Tests whether a column in df, specificied by string, is numeric
#'
#' @param col Character indicating column name
#' @param df Dataframe that contains `col`

.is.numeric_col <- function(col, df) {
  is.numeric(magrittr::extract2(df, col))
}